This document contains all statistical analyses conducted for the manuscript.
All data to reproduce analysis can be found here: https://github.com/biocom-uib/CAUvsCYM. This document uses precomputed data for heavy calculations. See README for details.

1 Setup

rm(list = ls())
knitr::opts_chunk$set(
   fig.width=10, 
  out.width="50%",
  fig.asp = 1,
  fig.align="center",
  echo = TRUE,
  message = FALSE, 
  warning = FALSE,
  hiline = TRUE,
  cache=TRUE
)
options(knitr.kable.NA = '')
knitr::opts_knit$set(global.par=TRUE)
par(cex.main=0.9,cex.axis=0.8,cex.lab=0.8)
options(scipen = 999)

We load the necessary packages:

library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(knitr)
library(ggplot2)
library(grid)
library(tidyverse)
library(splines)
library(zCompositions)
library(compositions)
library(robCompositions)
library(easyCODA)
library(ALDEx2)
library(viridis)
library(plotly)
library(ComplexHeatmap)
library(stats)
library(dendextend)
library(RColorBrewer)
library(kableExtra)
library(vegan)
library(ecolTest)
library(factoextra)
library(coda4microbiome)
library(broom)
source("funcionsCODAMETACIRCLE.R")


opar=par()
library(flextable)
set_flextable_defaults(
  font.family = "Arial", font.size = 10, 
  border.color = "gray", big.mark = "")

2 Statistical analysis of biochemical and cellular parameters

2.1 Sulfur concentrations

#Data and factors 
datos <- read_table("Sulfur_data_28ago25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
#Group summary for plotting 
df_summary <- datos %>%
  group_by(Time, Species, Treatment) %>%
  summarise(
    mean_sulfur = mean(Sulfur, na.rm = TRUE),
    sd_sulfur   = sd(Sulfur, na.rm = TRUE),
    n           = n(),
    se_sulfur   = sd_sulfur / sqrt(n)
  ) %>%
  ungroup()

# --- Figure (as in the paper: means ± SE) ---
p_a<- ggplot() +
  geom_jitter(data = datos,
              aes(x = Time, y = Sulfur, color = Species, 
                  shape = Treatment),
              width = 0.1,  
              alpha = 0.4,
              size = 1.2) +
  geom_line(data = df_summary,
            aes(x = Time, 
                y = mean_sulfur, 
                color = Species, 
                group = interaction(Species, Treatment),
                linetype = Treatment),
            size = 1) +
  geom_point(data = df_summary,
             aes(x = Time, 
                 y = mean_sulfur, 
                 color = Species, 
                 shape = Treatment),
             size = 3) +
  geom_errorbar(data = df_summary,
                aes(x = Time, 
                    ymin = mean_sulfur - se_sulfur,
                    ymax = mean_sulfur + se_sulfur,
                    color = Species),
                width = 0.1,
                size = 1.2) +
  scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
  labs(
    x = "Time",
    y = expression(paste("Sulfur (", mu, "M)")),
    color = "Species",
    shape = "Treatment",
    linetype = "Treatment"
  ) +
  theme_minimal() +
  theme(axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x  = element_text(face = "bold"),
        legend.title = element_text(face = "bold")
        ) +
  coord_cartesian(ylim = c(400, 1400)) +  
  scale_y_continuous(
    breaks = seq(400, 1400, by = 100)      
  ) 

p_a

The figure shows sulfur concentrations (mean ± SE) at baseline (T0) and after exposure (T2) for sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions. In both species, sulfur levels increased from T0 to T2. The increase was evident under both control and heatwave treatments in Caulerpa, whereas in Cymodocea the rise was more pronounced under heatwave conditions. Individual data points (faded symbols) illustrate the variability within groups.

To evaluate changes in sediment sulfur concentration, we applied a nonparametric testing framework. First, we examined within-group temporal changes (T0 vs T2) separately for each Species × Treatment using Wilcoxon tests, with p-values adjusted by the Benjamini–Hochberg procedure. We then compared treatments at T2 (Control vs Heatwave) within each species to assess whether heatwave exposure altered sulfur at the final time point. This two-step approach provides both temporal (within-group) and treatment-level (between-group) perspectives on sulfur dynamics.

We first report group medians at T0 and T2 and their difference (Δ = T2 − T0) for each Species × Treatment. These summaries are descriptive and motivate the subsequent nonparametric tests.

medians <- datos %>%
  group_by(Species, Treatment, Time) %>%
  summarise(median_sulfur = median(Sulfur, na.rm = TRUE), n = dplyr::n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = Time, values_from = median_sulfur) %>%
  # Opción A: con comillas invertidas (recomendado)
  mutate(delta_median = `T2` - `T0`)
  # Opción B (alternativa robusta):
  # mutate(delta_median = get("T2") - get("T0"))

knitr::kable(medians, digits = 2, caption = "Medians by group and change (T2 - T0)")
Table 1: Medians by group and change (T2 - T0)
Species Treatment n T0 T2 delta_median
CAU Control 9 945.91 1151.37 205.46
CAU HW 9 939.62 1284.93 345.31
CYM Control 9 785.53 894.56 109.03
CYM HW 9 835.85 1109.06 273.21

We then formally tested T0 vs T2 within each Species × Treatment using Wilcoxon rank–sum tests, controlling the false discovery rate with Benjamini–Hochberg adjustment.

time_tests <- datos %>%
  dplyr::group_by(Species, Treatment) %>%
  dplyr::summarise(
    p = {
      d <- dplyr::cur_data()
      x <- d$Sulfur[d$Time == "T0"]
      y <- d$Sulfur[d$Time == "T2"]
      if (length(x) > 0 && length(y) > 0) {
        suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
      } else {
        NA_real_
      }
    },
    .groups = "drop"
  ) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH"))

knitr::kable(time_tests, digits = 3,
             caption = "Wilcoxon (T0 vs T2) per Species × Treatment (BH-adjusted p)")
Table 2: Wilcoxon (T0 vs T2) per Species × Treatment (BH-adjusted p)
Species Treatment p p_adj
CAU Control 0.008 0.011
CAU HW 0.001 0.002
CYM Control 0.158 0.158
CYM HW 0.004 0.007

These outputs provide a basis for the statement that sulfur increased in both species and that the rise was more pronounced under heatwave conditions—particularly for Cymodocea at T2.

2.1.1 Difference-in-Differences (DiD) models

To complement the nonparametric analyses and jointly assess the effects of time and treatment, a difference-in-differences (DiD) model was applied separately for each species. This approach makes it possible to test whether the temporal change in sulfur differs significantly between control and heatwave conditions, while accounting for the main effects of treatment and time. In particular, the interaction term (Treatment × Time) captures the differential effect attributable to the heatwave at the final time point (TF), thus providing a direct measure of the magnitude of the treatment impact in each species.

Cymodocea — sulfur analysis

cy <- datos %>%
  dplyr::filter(Species == "CYM")

cy$Treatment <- factor(cy$Treatment, levels = c("Control", "HW"))
cy$Time      <- factor(cy$Time,      levels = c("T0", "T2"))

fit_cy <- lm(Sulfur ~ Treatment + Time + Treatment:Time, data = cy)
summary(fit_cy)
## 
## Call:
## lm(formula = Sulfur ~ Treatment + Time + Treatment:Time, data = cy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -318.31 -118.50   32.97   95.49  266.60 
## 
## Coefficients:
##                    Estimate Std. Error t value           Pr(>|t|)    
## (Intercept)          720.20      56.15  12.826 0.0000000000000372 ***
## TreatmentHW           21.73      79.41   0.274             0.7861    
## TimeT2               163.62      79.41   2.061             0.0476 *  
## TreatmentHW:TimeT2   129.66     112.30   1.155             0.2568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 168.5 on 32 degrees of freedom
## Multiple R-squared:  0.3877, Adjusted R-squared:  0.3303 
## F-statistic: 6.754 on 3 and 32 DF,  p-value: 0.001172

In Cymodocea, sulfur concentrations increased significantly from T0 to T2 under control conditions (Time effect: p = 0.048). The DiD model further estimated an additional increase under heatwave exposure at T2 (interaction coefficient = 129.7), consistent with the descriptive pattern of a more pronounced rise under heatwave conditions. However, this differential effect was not statistically significant (p = 0.26), indicating that while the data suggest a stronger increase in Cymodocea under heatwave exposure, statistical support for this effect remains moderate.

Caulerpa — sulfur analysis

ca <- datos %>%
  dplyr::filter(Species == "CAU")

ca$Treatment <- factor(cy$Treatment, levels = c("Control", "HW"))
ca$Time      <- factor(cy$Time,      levels = c("T0", "T2"))

fit_ca <- lm(Sulfur ~ Treatment + Time + Treatment:Time, data = ca)
summary(fit_ca)
## 
## Call:
## lm(formula = Sulfur ~ Treatment + Time + Treatment:Time, data = ca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -242.770  -71.945   -7.951   83.563  226.415 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         955.346     40.921  23.346 < 0.0000000000000002 ***
## TreatmentHW          -1.572     57.872  -0.027             0.978493    
## TimeT2              222.755     57.872   3.849             0.000534 ***
## TreatmentHW:TimeT2  114.871     81.843   1.404             0.170080    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 122.8 on 32 degrees of freedom
## Multiple R-squared:  0.6131, Adjusted R-squared:  0.5769 
## F-statistic: 16.91 on 3 and 32 DF,  p-value: 0.0000009271

In Caulerpa, sulfur concentrations increased significantly from T0 to T2 under control conditions (Time effect: p < 0.001). The DiD model further estimated an additional increase under heatwave exposure at T2 (interaction coefficient = 114.9), consistent with the descriptive pattern of greater accumulation under heatwave conditions. However, this differential effect was not statistically significant (p = 0.17), indicating that while sulfur accumulation clearly occurred over time, the evidence for an extra increase driven by heatwave exposure remains moderate.

2.2 Redox potential

#Data
datos <- read_table("redox_data_12feb25.txt")
datos$Species<-as.factor(datos$Species)
datos$Treatment<-as.factor(datos$Treatment)
datos$Time<-factor(datos$Time, levels = c("T0", "T2"))
#Group summary for plotting
df_summary <- datos %>%
  group_by(Time, Species, Treatment) %>%
  summarise(
    mean_redox = mean(redox, na.rm = TRUE),
    sd_redox   = sd(redox, na.rm = TRUE),
    n           = n(),
    se_redox   = sd_redox / sqrt(n)
  ) %>%
  ungroup()


p_b <- ggplot() +
  geom_jitter(data = datos,
              aes(x = Time, y = redox, color = Species, shape = Treatment),
              width = 0.1,  
              alpha = 0.4,
              size = 1.2) +
  
  geom_line(data = df_summary,
            aes(x = Time, 
                y = mean_redox, 
                color = Species, 
                group = interaction(Species, Treatment),
                linetype = Treatment),
            size = 1) +
  geom_point(data = df_summary,
             aes(x = Time, 
                 y = mean_redox, 
                 color = Species, 
                 shape = Treatment),
             size = 3) +
  geom_errorbar(data = df_summary,
                aes(x = Time, 
                    ymin = mean_redox - se_redox,
                    ymax = mean_redox + se_redox,
                    color = Species),
                width = 0.1,
                size = 1.2) +
  scale_color_manual(values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
  labs(
    x = "Time",
    y = expression(paste("redox (",m, "M)")) ,
    color = "Species",
    shape = "Treatment",
    linetype = "Treatment"
  ) +
  theme_minimal() + 
  theme(axis.title.x = element_text(face = "bold"),
         axis.title.y = element_text(face = "bold"),
        axis.text.x  = element_text(face = "bold"),
        legend.title = element_text(face = "bold")
        ) +
   coord_cartesian(ylim = c(-430, -330)) +  
   scale_y_continuous(
     breaks = seq(-430, -330, by = 10)      
   )
p_b

The figure shows sediment redox potential (mean ± SE) at baseline (T0) and after exposure (T2) for sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions. In Caulerpa, redox potential declined significantly from T0 to T2 under control conditions but remained stable under heatwave conditions. In Cymodocea, redox potential also decreased over time, with a stronger decline under heatwave treatment.

To evaluate changes in sediment redox potential, we applied a nonparametric testing framework. First, we examined within-group temporal changes (T0 vs T2) separately for each Species × Treatment using Wilcoxon tests, with p-values adjusted by the Benjamini–Hochberg procedure. We then compared treatments at T2 (Control vs Heatwave) within each species to assess whether heatwave exposure altered redox potential at the final time point. This two-step approach provides both temporal (within-group) and treatment-level (between-group) perspectives on redox dynamics.

We present a nonparametric analysis of sediment redox potential. First, we assess within-group change from T0 to T2 for each Species × Treatment using Wilcoxon tests and compare treatments at T2 within each species. P-values are adjusted for multiple testing with the Benjamini–Hochberg procedure

eh_within <- datos %>%
  dplyr::group_by(Species, Treatment) %>%
  dplyr::summarise(
    p = {
      d <- dplyr::cur_data()
      x <- d$redox[d$Time == "T0"]
      y <- d$redox[d$Time == "T2"]
      if (length(x) > 0 && length(y) > 0) {
        suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
      } else NA_real_
    },
    .groups = "drop"
  ) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH"))

knitr::kable(eh_within, digits = 3,
             caption = "Wilcoxon (T0 vs T2) by Species × Treatment (BH-adjusted p)")
Table 3: Wilcoxon (T0 vs T2) by Species × Treatment (BH-adjusted p)
Species Treatment p p_adj
CAU Control 0.005 0.020
CAU HW 0.575 0.575
CYM Control 0.298 0.397
CYM HW 0.045 0.091

These summaries show that redox potential generally decreased over time, with the largest declines occurring in Caulerpa under control conditions and in Cymodocea under heatwave conditions. These descriptive patterns motivated the subsequent nonparametric tests.

eh_t2_bt <- datos %>%
  dplyr::filter(Time == "T2") %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(
    p = {
      d <- dplyr::cur_data()
      if (dplyr::n_distinct(d$Treatment) >= 2) {
        suppressWarnings(stats::wilcox.test(redox ~ Treatment, data = d, exact = FALSE)$p.value)
      } else NA_real_
    },
    .groups = "drop"
  ) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH"))

knitr::kable(eh_t2_bt, digits = 3,
             caption = "Wilcoxon (Treatment) at T2 by Species (BH-adjusted p)")
Table 4: Wilcoxon (Treatment) at T2 by Species (BH-adjusted p)
Species p p_adj
CAU 0.298 0.298
CYM 0.045 0.091

Overall, redox potential tended to decrease from T0 to T2, but the magnitude and significance of these shifts varied across species and treatments. In Caulerpa, a significant decline was detected under control conditions (BH-adjusted p = 0.020), whereas no change was observed under heatwave conditions. In Cymodocea, redox potential also decreased over time, with a trend towards stronger reductions under heatwave exposure (raw p = 0.045), though this did not remain significant after correction (BH-adjusted p = 0.091). Comparisons between treatments at T2 revealed no differences for Caulerpa and only a marginal effect for Cymodocea (BH-adjusted p = 0.091).

2.2.1 Difference-in-Differences (DiD) models

We fitted difference-in-differences (DiD) models for each species. This framework evaluates whether changes in redox potential from T0 to T2 differed between control and heatwave conditions, while simultaneously accounting for the main effects of treatment and temporal progression. The key parameter of interest is the interaction term (Treatment × Time), which quantifies the additional effect attributable to heatwave exposure at the final sampling point (T2), thereby offering a direct estimate of the treatment-specific impact.

Cymodocea — redox analysis

fit_cy <- lm(Redox ~ Treatment + Time + Treatment:Time, data = cy)
summary(fit_cy)
## 
## Call:
## lm(formula = Redox ~ Treatment + Time + Treatment:Time, data = cy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.133 -13.897  -2.578  10.992  58.667 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)        -367.567      6.886 -53.382 <0.0000000000000002 ***
## TreatmentHW           3.100      9.738   0.318              0.7523    
## TimeT2              -20.711      9.738  -2.127              0.0412 *  
## TreatmentHW:TimeT2  -19.089     13.771  -1.386              0.1753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.66 on 32 degrees of freedom
## Multiple R-squared:  0.4086, Adjusted R-squared:  0.3531 
## F-statistic: 7.368 on 3 and 32 DF,  p-value: 0.000689

In Cymodocea, redox potential declined significantly from T0 to T2 under control conditions (Time effect: –20.7, p = 0.041). The DiD model further estimated an additional reduction under heatwave exposure at T2 (interaction coefficient = –19.1), consistent with the descriptive observation of stronger declines under heatwave conditions. However, this interaction was not statistically significant (p = 0.18), indicating that while the data suggest a heatwave-driven intensification of the temporal decline in redox potential, the statistical support for this effect remains limited.

Caulerpa — redox analysis

fit_ca <- lm(Redox ~ Treatment + Time + Treatment:Time, data = ca)
summary(fit_ca)
## 
## Call:
## lm(formula = Redox ~ Treatment + Time + Treatment:Time, data = ca)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28.3444 -11.2625   0.1944   7.6500  30.1556 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)        -392.850      4.974 -78.977 <0.0000000000000002 ***
## TreatmentHW         -13.794      7.035  -1.961              0.0586 .  
## TimeT2               -6.506      7.035  -0.925              0.3620    
## TreatmentHW:TimeT2    1.817      9.948   0.183              0.8563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.92 on 32 degrees of freedom
## Multiple R-squared:  0.2002, Adjusted R-squared:  0.1252 
## F-statistic:  2.67 on 3 and 32 DF,  p-value: 0.06413

In Caulerpa, redox potential showed no significant temporal change from T0 to T2 under control conditions (Time effect: –6.5, p = 0.36). The model estimated a slightly lower baseline under heatwave compared to control at T0 (Treatment effect: –13.8, p = 0.059), suggesting a marginal initial difference between treatments. However, the interaction term (Treatment × Time) was small and not significant (p = 0.86), indicating that heatwave exposure did not alter the temporal trajectory of redox potential. Overall, these results suggest that, unlike Cymodocea, the decline in redox observed in Caulerpa sediments was not exacerbated by heatwave conditions.

2.3 Cellular parameters

We analyzed cell counts in sediments colonized by either Caulerpa or Cymodocea under control and heatwave conditions, assessing temporal changes (T0–T1–T2) within each Species × Treatment group as well as differences between species.

#Data
datos <- read_table("cell_shanon_data_12_05_25.txt") %>% 
  mutate(
    Species   = factor(Species),
    Time      = factor(Time, levels = c("T0","T1","T2")),
    Treatment = factor(Treatment),
    cells     = as.numeric(cells)
  )

#Group summary for plotting
summary_data <- datos %>%
  group_by(Species, Treatment, Time) %>%
  summarise(
    mean_cells = mean(cells, na.rm = TRUE),
    sd         = sd(cells,   na.rm = TRUE),
    n          = n(),
    se         = sd / sqrt(n),
    .groups    = "drop"
  )

t1_hw <- summary_data %>% 
  filter(Treatment == "HW", Time == "T1") %>%
  select(Species, y    = mean_cells)
t2_hw <- summary_data %>% 
  filter(Treatment == "HW", Time == "T2") %>%
  select(Species, yend = mean_cells)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")

t12     <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_cells - t12$se, t12$mean_cells + t12$se)
ymin12  <- rango12[1] - 0.025 * diff(rango12)
ymax12  <- rango12[2] + 0.025 * diff(rango12)


p_c<- ggplot() +
  geom_jitter(
    data  = datos,
    aes(x = Time, y = cells, color = Species, shape = Treatment),
    width = 0.1, alpha = 0.4, size = 1.2
  ) +
  geom_line(
    data = summary_data,
    aes(x = Time, y = mean_cells,
        color    = Species,
        linetype = Treatment,
        group    = interaction(Species, Treatment)),
    size = 1
  ) +
  geom_point(
    data  = summary_data,
    aes(x = Time, y = mean_cells, color = Species, shape = Treatment),
    size = 3
  ) +
  geom_errorbar(
    data = summary_data,
    aes(x = Time,
        ymin  = mean_cells - se,
        ymax  = mean_cells + se,
        color = Species),
    width = 0.2, size = 1.25
  ) +
  geom_segment(
    data      = summary_data,
    aes(y     = 53800000,
        yend  = 38700000,
        color = "CYM"),
    x      = 2,      
    xend   = 3,     
    linetype = "dashed",
    size      = 1.5
  ) +
  geom_segment(
    data      = summary_data,
    aes(y     = 173333333,
        yend  = 212000000,
        color = "CAU"),
    x      = 2,     
    xend   = 3,     
    linetype = "dashed",
    size      = 1.5
  ) +
  scale_color_manual(name   = "Species",
                     values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
  scale_shape_manual(name   = "Treatment",
                     values = c("Control" = 16,  # circles filled
                                "HW"      = 17)) +# triangles filled
  scale_linetype_manual(name   = "Treatment",
                        values = c("Control" = "solid",
                                   "HW"      = "dashed")) +
  guides(
    shape    = guide_legend(override.aes = list(size = 3)),
    linetype = guide_legend(override.aes = list(size = 1.5))
  ) +
  labs(
    x        = "Time",
    y        = "Cells",
    color    = "Species",
    shape    = "Treatment",
    linetype = "Treatment"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x  = element_text(face = "bold"),
    legend.title = element_text(face = "bold")
  ) +
  coord_cartesian(ylim = c(ymin12, ymax12)) +
  scale_y_continuous(
    breaks = pretty(c(ymin12, ymax12), n = 5),
    labels = expression(0,5%*%10^7,1%*%10^8,1.5%*%10^8,2%*%10^8,2.5%*%10^8),
    expand = expansion(mult = c(0,0))
  )

p_c

The figure shows cell counts in sediments colonized by Caulerpa (blue) and Cymodocea (orange) under control (solid lines) and heatwave (dashed lines) conditions across three time points (T0, T1, T2). In Caulerpa, cell numbers remained relatively stable from T0 to T1, followed by a marked increase under heatwave conditions at T2, while control samples decreased slightly. In contrast, Cymodocea exhibited consistently lower cell counts overall, with values remaining stable between T0 and T1 but declining toward T2 under both treatments, particularly under heatwave exposure.

We first quantified the overall range of cell counts observed in each species to provide context for the magnitude of the differences.

ranges <- datos %>%
  group_by(Species) %>%
  summarise(
    min_cells = min(cells, na.rm = TRUE),
    max_cells = max(cells, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(min_x1e9 = round(min_cells / 1e9, 1),
         max_x1e9 = round(max_cells / 1e9, 1))

knitr::kable(ranges, digits = 2, caption = "Species-wise ranges (×10^9 cells/g)")
Table 5: Species-wise ranges (×10^9 cells/g)
Species min_cells max_cells min_x1e9 max_x1e9
CAU 116000000 221000000 0.1 0.2
CYM 19800000 61900000 0.0 0.1

These ranges highlight the broader scale of variation in Caulerpa compared with Cymodocea, providing a baseline for interpreting subsequent tests.

To evaluate species-level differences, we first applied a Wilcoxon rank–sum test pooling across all time points and treatments.

# Overall (pooling all times/treatments)
bt_overall <- suppressWarnings(stats::wilcox.test(cells ~ Species, data = datos, exact = FALSE))

# By time (T0, T1, T2) — useful to show consistency
bt_by_time <- datos %>%
  group_by(Time) %>%
  summarise(
    p = {
      d <- cur_data()
      suppressWarnings(stats::wilcox.test(d$cells ~ d$Species, exact = FALSE)$p.value)
    },
    .groups = "drop"
  )

knitr::kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by Time")
Table 6: Between-species Wilcoxon by Time
Time p
T0 0.081
T1 0.081
T2 0.005
bt_overall$p.value
## [1] 0.00003658455

This overall comparison indicated that species differ in cell counts.

To assess whether cell counts changed over time within each Species × Treatment combination, we first applied a nonparametric global test across the three sampling points (T0, T1, T2).

# Kruskal–Wallis per group
kw_time <- datos %>%
  group_by(Species, Treatment) %>%
  summarise(
    kw_p = {
      d <- cur_data()
      if (n_distinct(d$Time) >= 2) {
        suppressWarnings(kruskal.test(cells ~ Time, data = d)$p.value)
      } else NA_real_
    },
    .groups = "drop"
  )

# Pairwise Wilcoxon per group (BH across the 3 pairwise contrasts within each group)
pw_time <- datos %>%
  group_by(Species, Treatment) %>%
  group_modify(~{
    d <- .x
    # Need at least 2 time levels
    if (n_distinct(d$Time) < 2) return(tibble())
    pw <- pairwise.wilcox.test(d$cells, d$Time, p.adjust.method = "BH", exact = FALSE)
    # Convert p-value matrix to long
    M <- as.data.frame(as.table(pw$p.value))
    names(M) <- c("Time1","Time2","p_adj")
    M <- M %>% filter(!is.na(p_adj))
    # Add direction using medians
    meds <- d %>% group_by(Time) %>% summarise(med = median(cells, na.rm = TRUE), .groups="drop")
    M <- M %>%
      left_join(meds, by = c("Time1" = "Time")) %>% rename(med1 = med) %>%
      left_join(meds, by = c("Time2" = "Time")) %>% rename(med2 = med) %>%
      mutate(direction = case_when(
        med2 > med1 ~ "increase",
        med2 < med1 ~ "decrease",
        TRUE ~ "no_change"
      ))
    M
  }) %>%
  ungroup()

knitr::kable(kw_time, digits = 3, caption = "Kruskal–Wallis across T0–T1–T2 by Species × Treatment")
Table 7: Kruskal–Wallis across T0–T1–T2 by Species × Treatment
Species Treatment kw_p
CAU Control 0.491
CAU HW
CYM Control 0.193
CYM HW
knitr::kable(pw_time, digits = 3, caption = "Pairwise Wilcoxon (BH-adjusted) and median directions")
Table 7: Pairwise Wilcoxon (BH-adjusted) and median directions
Species Treatment Time1 Time2 p_adj med1 med2 direction
CAU Control T1 T0 0.663 168000000 159000000 decrease
CAU Control T2 T0 0.663 148000000 159000000 increase
CAU Control T2 T1 0.663 148000000 168000000 increase
CYM Control T1 T0 1.000 52400000 54000000 increase
CYM Control T2 T0 0.286 26500000 54000000 increase
CYM Control T2 T1 0.286 26500000 52400000 increase

Although the nonparametric tests did not detect statistically significant within-group changes, the descriptive patterns in the figures suggest some biologically meaningful trends. In particular, Cymodocea under control conditions showed a consistent rise in median cell counts from T0 to T2, and Caulerpa exhibited fluctuations that point to a potential increase by the final time point. The absence of statistical significance likely reflects the limited sample size and variability within groups, rather than a complete lack of temporal dynamics. Thus, while the formal tests provide little evidence for strong within-group effects, the graphical trends indicate subtle trajectories that may become clearer with larger sample sizes or longer experimental follow-up.

3 Biodiversity

Shannon diversity indices were computed with the diversity function in vegan and summarized by Species × Treatment × Time. We first visualized mean trajectories (±SE) together with individual observations to inspect temporal patterns and treatment contrasts.

library(readr)
library(dplyr)
library(ggplot2)
library(grid)    

datos <- read_table("cell_shanon_data_12_05_25.txt") %>% 
  mutate(
    Species   = factor(Species),
    Time      = factor(Time, levels = c("T0","T1","T2")),
    Treatment = factor(Treatment),
    shanon     = as.numeric(shanon)
  )

summary_data <- datos %>%
  group_by(Species, Treatment, Time) %>%
  summarise(
    mean_shanon = mean(shanon, na.rm = TRUE),
    sd         = sd(shanon,   na.rm = TRUE),
    n          = n(),
    se         = sd / sqrt(n),
    .groups    = "drop"
  )


t1_hw <- summary_data %>% 
  filter(Treatment == "HW", Time == "T1") %>%
  select(Species, y    = mean_shanon)
t2_hw <- summary_data %>% 
  filter(Treatment == "HW", Time == "T2") %>%
  select(Species, yend = mean_shanon)
segments_hw <- left_join(t1_hw, t2_hw, by = "Species")


t12     <- summary_data %>% filter(Time %in% c("T1","T2"))
rango12 <- range(t12$mean_shanon - t12$se, t12$mean_shanon + t12$se)
ymin12  <- rango12[1] - 0.55 * diff(rango12)
ymax12  <- rango12[2] + 0.25 * diff(rango12)


p_d<- ggplot() +
  geom_jitter(
    data  = datos,
    aes(x = Time, y = shanon, color = Species, shape = Treatment),
    width = 0.1, alpha = 0.4, size = 1.2
  ) +
  geom_line(
    data = summary_data,
    aes(x = Time, y = mean_shanon,
        color    = Species,
        linetype = Treatment,
        group    = interaction(Species, Treatment)),
    size = 1
  ) +
  geom_point(
    data  = summary_data,
    aes(x = Time, y = mean_shanon, color = Species, shape = Treatment),
    size = 3
  ) +
  geom_errorbar(
    data = summary_data,
    aes(x = Time,
        ymin  = mean_shanon - se,
        ymax  = mean_shanon + se,
        color = Species),
    width = 0.2, size = 1.25
  ) +
  geom_segment(
    data      = summary_data,
    aes(y     = 5.072667,
        yend  = 5.056000,
        color = "CYM"),
    x      = 2,     
    xend   = 3,      
    linetype = "dashed",
    size      = 1.5
  ) +
  geom_segment(
    data      = summary_data,
    aes(y     = 5.201667,
        yend  = 5.180333,
        color = "CAU"),
    x      = 2,     
    xend   = 3,      
    linetype = "dashed",
    size      = 1.5
  ) +
  scale_color_manual(name   = "Species",
                     values = c("CAU" = "#008ae6", "CYM" = "#ff751a")) +
  scale_shape_manual(name   = "Treatment",
                     values = c("Control" = 16,  # circles filled
                                "HW"      = 17)) +# triangles filled
  scale_linetype_manual(name   = "Treatment",
                        values = c("Control" = "solid",
                                   "HW"      = "dashed")) +
  guides(
    shape    = guide_legend(override.aes = list(size = 3)),
    linetype = guide_legend(override.aes = list(size = 1.5))
  ) +
  labs(
    x        = "Time",
    y        = "Shannon diversity index",
    color    = "Species",
    shape    = "Treatment",
    linetype = "Treatment"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.text.x  = element_text(face = "bold"),
    legend.title = element_text(face = "bold")
  ) +
  coord_cartesian(ylim = c(ymin12, ymax12)) +
  scale_y_continuous(
    breaks = pretty(c(ymin12, ymax12), n = 5),
    expand = expansion(mult = c(0,0))
  ) 
p_d

The plot shows that Caulerpa maintains relatively high and stable diversity across the experiment, with only slight changes between T1 and T2. Cymodocea exhibits a marked rise from T0 to T1, followed by a modest leveling or decline toward T2 under heatwave exposure, while control samples remain comparatively stable. These descriptive trends motivated formal between-species comparisons.

To test whether species differed in Shannon diversity, we applied Wilcoxon rank–sum tests overall (pooling time points) and separately at each sampling time.

## Overall (all times + treatments pooled)
bt_overall <- suppressWarnings(stats::wilcox.test(shanon ~ Species, data = datos, exact = FALSE))
bt_overall_p <- bt_overall$p.value

## By time (T0, T1, T2)
bt_by_time <- datos %>%
  group_by(Time) %>%
  summarise(
    med_CAU = median(shanon[Species == "CAU"], na.rm = TRUE),
    med_CYM = median(shanon[Species == "CYM"], na.rm = TRUE),
    p = {
      d <- cur_data()
      suppressWarnings(stats::wilcox.test(d$shanon ~ d$Species, exact = FALSE)$p.value)
    },
    .groups = "drop"
  )

kable(bt_by_time, digits = 3, caption = "Between-species Wilcoxon by time (Shannon)")
Table 8: Between-species Wilcoxon by time (Shannon)
Time med_CAU med_CYM p
T0 5.188 4.927 0.081
T1 5.198 5.056 0.383
T2 5.207 5.080 0.173
bt_overall_p
## [1] 0.01300303

The overall Wilcoxon test indicated a species effect across time points (report printed bt_overall_p). By-time Wilcoxon tests (unadjusted p-values as printed in the table) suggest a marginal difference at T0, with higher diversity in Caulerpa, whereas differences at T1 and T2 were not statistically significant. Taken together, the figure and tests indicate that Caulerpa generally sustains higher Shannon diversity than Cymodocea, while temporal dynamics—especially the T0→T1 increase in Cymodocea and the slight T1→T2 softening under heatwave—are modest in magnitude.

3.0.1 Acclimation change (T0 → T1) within species

We next examined acclimation effects by directly comparing Shannon diversity between T0 and T1 for each species.

acclim <- datos %>%
  group_by(Species) %>%
  summarise(
    med_T0 = median(shanon[Time == "T0"], na.rm = TRUE),
    med_T1 = median(shanon[Time == "T1"], na.rm = TRUE),
    p = {
      x <- shanon[Time == "T0"]; y <- shanon[Time == "T1"]
      if (length(x) > 0 && length(y) > 0)
        suppressWarnings(stats::wilcox.test(x, y, exact = FALSE)$p.value)
      else NA_real_
    },
    direction = case_when(med_T1 > med_T0 ~ "increase",
                          med_T1 < med_T0 ~ "decrease",
                          TRUE            ~ "no_change"),
    .groups = "drop"
  )

kable(acclim, digits = 3, caption = "T0 vs T1 (acclimation) by species")
Table 9: T0 vs T1 (acclimation) by species
Species med_T0 med_T1 p direction
CAU 5.188 5.198 0.663 increase
CYM 4.927 5.056 0.383 increase

During the acclimation period (T0 → T1), Shannon diversity showed a slight increase in both Caulerpa and Cymodocea sediments (median values rising from 5.19 to 5.20 and from 4.93 to 5.06, respectively). However, these changes were not statistically significant (p = 0.66 for Caulerpa, p = 0.38 for Cymodocea), indicating that although diversity tended to rise during acclimation, the magnitude of the increase was modest and within the range of sampling variability.

3.1 Temporal change under Control (T0–T1–T2)

To assess whether Shannon diversity varied over time in the absence of heatwave stress, we applied Kruskal–Wallis tests separately for Caulerpa and Cymodocea under control conditions. This approach evaluates overall differences across the three sampling times (T0, T1, T2) without assuming normality.

datos <- read_table("cell_shanon_data_12_05_25_complete.txt") %>% 
  mutate(
    Species   = factor(Species),
    Time      = factor(Time, levels = c("T0","T1","T2")),
    Treatment = factor(Treatment),
    cells     = as.numeric(cells)
  )

kw_ctrl <- datos %>%
  filter(Treatment == "Control") %>%
  group_by(Species) %>%
  summarise(
    kw_p = {
      d <- cur_data()
      suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
    },
    .groups = "drop"
  )

kable(kw_ctrl, digits = 3, caption = "Kruskal–Wallis test across T0–T1–T2 under control conditions (by species)")
Table 10: Kruskal–Wallis test across T0–T1–T2 under control conditions (by species)
Species kw_p
CAU 0.733
CYM 0.432

The results (Table) show no significant temporal variation in Shannon diversity under control conditions (p = 0.733 for Caulerpa, p = 0.432 for Cymodocea). This indicates that, in the absence of heatwave exposure, microbial diversity remained relatively stable across the course of the experiment in both host species.

3.2 Temporal change under Heatwave (T0- T1–T2)

To evaluate whether heatwave exposure altered the temporal trajectory of Shannon diversity, we applied Kruskal–Wallis tests across the three time points within each species.

kw_hw <- datos %>%
  filter(Treatment == "HW") %>%
  group_by(Species) %>%
  summarise(
    kw_p = {
      d <- cur_data()
      suppressWarnings(kruskal.test(shanon ~ Time, data = d)$p.value)
    },
    .groups = "drop"
  )

kable(kw_hw, digits = 3, caption = "Kruskal–Wallis across T0–T1–T2 under heatwave conditions (by species)")
Table 11: Kruskal–Wallis across T0–T1–T2 under heatwave conditions (by species)
Species kw_p
CAU 0.875
CYM 0.288

Under heatwave conditions, no significant temporal changes were detected in either species (Caulerpa: p = 0.875; Cymodocea: p = 0.288). These results indicate that, while the figure suggests a slight T1→T2 softening in Cymodocea under heatwave exposure, the magnitude of the shift was insufficient to reach statistical significance. Combined with the control analysis, this supports the view that Shannon diversity remained broadly stable over time in both species, with only modest, non-significant fluctuations under heat stress.

3.3 Caulerpa control averages between T1 and T2

To probe short-interval changes under control conditions, we compared Shannon diversity between T1 and T2 for Caulerpa using descriptive summaries and a Wilcoxon test as a robust alternative to the Welch t-test.

target_var <- "shanon"

dat <- datos %>%
  filter(Species == "CAU", Treatment == "Control", Time %in% c("T1","T2")) %>%
  select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
  filter(!is.na(value))

summary_tbl <- dat %>%
  group_by(Time) %>%
  summarise(
    n    = dplyr::n(),
    mean = mean(value),
    sd   = sd(value),
    se   = sd/sqrt(n),
    .groups = "drop"
  )

knitr::kable(summary_tbl, digits = 3,
             caption = "Caulerpa — Control: comparison of Shannon diversity between T1 and T2 (mean, SD, SE, n)")
Table 12: Caulerpa — Control: comparison of Shannon diversity between T1 and T2 (mean, SD, SE, n)
Time n mean sd se
T1 3 5.202 0.075 0.043
T2 3 5.195 0.047 0.027
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
## 
##  Welch Two Sample t-test
## 
## data:  value by Time
## t = 0.13793, df = 3.353, p-value = 0.8982
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
##  -0.1453022  0.1593022
## sample estimates:
## mean in group T1 mean in group T2 
##         5.201667         5.194667
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  value by Time
## W = 5, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

The summary table shows nearly identical means at T1 and T2 (5.202 vs 5.195), with small standard errors. Welch’s test confirmed no evidence of a difference (t = 0.138, df = 3.35, p = 0.898; 95% CI for the mean difference: −0.145 to 0.159). Results were consistent with the Welch test, showing no evidence of a difference between time points (Wilcoxon p = 1). Given the very small sample size (n = 3 per time), these findings should be interpreted as compatible with negligible change rather than proof of no effect. Thus, Shannon diversity in Caulerpa remained stable between T1 and T2 under control conditions.

3.4 Caulerpa heatwave averages between T1 and T2

To check short-interval changes under heatwave conditions, we compared Shannon diversity between T1 and T2 for Caulerpa using descriptive summaries and a Wilcoxon test as a robust alternative to the Welch t-test.

dat <- datos %>%
  filter(Species == "CAU", Treatment == "HW", Time %in% c("T1","T2")) %>%
  select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
  filter(!is.na(value))

summary_tbl <- dat %>%
  group_by(Time) %>%
  summarise(
    n    = dplyr::n(),
    mean = mean(value),
    sd   = sd(value),
    se   = sd/sqrt(n),
    .groups = "drop"
  )

knitr::kable(summary_tbl, digits = 3,
             caption = "Caulerpa – HW: T1 vs T2 (mean, SD, n)")
Table 13: Caulerpa – HW: T1 vs T2 (mean, SD, n)
Time n mean sd se
T1 3 5.202 0.075 0.043
T2 3 5.180 0.139 0.080
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
## 
##  Welch Two Sample t-test
## 
## data:  value by Time
## t = 0.23467, df = 3.0672, p-value = 0.8293
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
##  -0.2644285  0.3070952
## sample estimates:
## mean in group T1 mean in group T2 
##         5.201667         5.180333
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  value by Time
## W = 4, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

The summary table shows very similar means at T1 and T2 (5.202 vs 5.180) with small standard errors. Both Welch’s test (t = 0.235, df = 3.07, p = 0.829; 95% CI for the mean difference: –0.264 to 0.308) and the Wilcoxon rank-sum test (W = 4, p = 1.00) indicated no evidence of a difference between the two time points. Thus, under heatwave conditions, Caulerpa diversity remained essentially stable between T1 and T2. Given the small sample size (n = 3 per time), these results should be interpreted as consistent with a negligible effect rather than proof of no effect.

3.5 Cymodocea control averages between T1 and T2

dat <- datos %>%
  filter(Species == "CYM", Treatment == "Control", Time %in% c("T1","T2")) %>%
  select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
  filter(!is.na(value))

summary_tbl <- dat %>%
  group_by(Time) %>%
  summarise(
    n    = dplyr::n(),
    mean = mean(value),
    sd   = sd(value),
    se   = sd/sqrt(n),
    .groups = "drop"
  )

knitr::kable(summary_tbl, digits = 3,
             caption = "Cymodocea – Control: T1 vs T2 (mean, SD, n)")
Table 14: Cymodocea – Control: T1 vs T2 (mean, SD, n)
Time n mean sd se
T1 3 5.073 0.165 0.095
T2 3 5.084 0.203 0.117
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
## 
##  Welch Two Sample t-test
## 
## data:  value by Time
## t = -0.077316, df = 3.8365, p-value = 0.9422
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
##  -0.4377554  0.4144221
## sample estimates:
## mean in group T1 mean in group T2 
##         5.072667         5.084333
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  value by Time
## W = 4.5, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

The summary statistics indicate that Cymodocea maintained very similar Shannon diversity values between T1 (mean = 5.073, SE = 0.095) and T2 (mean = 5.084, SE = 0.117). Both Welch’s two-sample t-test (t = –0.077, df = 3.84, p = 0.942) and the Wilcoxon rank-sum test (W = 5, p = 1.00) confirmed the absence of significant differences between the two time points. These findings suggest that under control conditions, Cymodocea diversity remained stable across the short interval from T1 to T2, with no detectable changes despite minor variation in the group means.

3.6 Cymodocea heatwave averages between T1 and T2

dat <- datos %>%
  filter(Species == "CYM", Treatment == "HW", Time %in% c("T1","T2")) %>%
  select(ID = dplyr::any_of("ID"), Time, value = all_of(target_var)) %>%
  filter(!is.na(value))

summary_tbl <- dat %>%
  group_by(Time) %>%
  summarise(
    n    = dplyr::n(),
    mean = mean(value),
    sd   = sd(value),
    se   = sd/sqrt(n),
    .groups = "drop"
  )

knitr::kable(summary_tbl, digits = 3,
             caption = "Cymodocea – HW: T1 vs T2 (mean, SD, n)")
Table 15: Cymodocea – HW: T1 vs T2 (mean, SD, n)
Time n mean sd se
T1 3 5.073 0.165 0.095
T2 3 5.056 0.106 0.061
tt_welch <- t.test(value ~ Time, data = dat, var.equal = FALSE)
tt_welch
## 
##  Welch Two Sample t-test
## 
## data:  value by Time
## t = 0.14762, df = 3.4062, p-value = 0.891
## alternative hypothesis: true difference in means between group T1 and group T2 is not equal to 0
## 95 percent confidence interval:
##  -0.3195780  0.3529113
## sample estimates:
## mean in group T1 mean in group T2 
##         5.072667         5.056000
w_test <- stats::wilcox.test(value ~ Time, data = dat, exact = FALSE)
w_test
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  value by Time
## W = 4, p-value = 1
## alternative hypothesis: true location shift is not equal to 0

The descriptive statistics for Cymodocea under heatwave conditions show nearly identical Shannon diversity means at T1 (5.073, SE = 0.095) and T2 (5.056, SE = 0.061). Welch’s two-sample t-test found no evidence of a difference (t = 0.148, df = 3.41, p = 0.891; 95% CI: –0.320 to 0.352), and the Wilcoxon rank-sum test likewise indicated no shift in location (W = 5, p = 1.00).

4 Taxa biodiversity analysis

Datos.Muestras=data.frame(read_excel("TablaOTUs.xlsx"))
Samples.original=Datos.Muestras$ID 
DF.0.original=Datos.Muestras[,-c(1,2)]
row.names(DF.0.original)=Samples.original
OTUs=paste("OTU",1:dim(DF.0.original)[2],sep="")
names(DF.0.original)=OTUs

Sizes=rowSums(DF.0.original)

Tipos.original=as.factor(rep(c("CAU_T0","CAU_T1","CAU_T2C", "CAU_T2HW","CYM_T0","CYM_T1", "CYM_T2C","CYM_T2HW"),each=3))

4.1 Comparison of Shannon indices indices of sample types

We have quantified the biodiversity of every type of samples CAU_T* or CYM_T* by means of the Shannon index of the sample obtained by merging the 3 samples of that type. To test which pairs of sample types had significantly different Shannon indices, we have used the Hutcheson t-test, as implemented in function Hutcheson_t_test of the R package ecolTest, and adjusted the p-values with the Holm method.

# Shannon index
SH=function(x){
  vegan::diversity(x,index = "shannon")
}

p.lnp2=function(p){
  if (p==0){return(0)}
  else {
    return(p*(log(p)^2))
  }
}

# Hutcheson variance estimator
Var.SH=function(x){
N=sum(x)
y=sapply(x/sum(x),FUN=p.lnp2)
bit=sapply(x,FUN=function(x){min(x,1)})
S=sum(bit)
vv=(sum(y)-SH(x)^2)/N+(S-1)/(2*N^2)
return(vv)
}

# Standard deviation estimator
sd.SH=function(x){
  sqrt(Var.SH(x))
}
Samples=Samples.original
DF.0=DF.0.original
Tipos=Tipos.original
DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]

For each sample, we compute its Shannon index and an estimation of its standard deviation (sd):

Resultados=data.frame(Samples,
round(apply(DF.0,MARGIN=1,SH),4),
round(apply(DF.0,MARGIN=1,sd.SH),4))
row.names(Resultados)=NULL
names(Resultados)=c("Samples","SH","sd")

flextable(Resultados)|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Samples

SH

sd

CAU_T0_1

5.2297

0.0099

CAU_T0_2

5.0583

0.0083

CAU_T0_3

5.1884

0.0108

CAU_T1_1

5.2781

0.0098

CAU_T1_2

5.1297

0.0099

CAU_T1_3

5.1981

0.0104

CAU_T2C_1

5.1477

0.0102

CAU_T2C_2

5.1980

0.0096

CAU_T2C_3

5.2405

0.0109

CAU_T2HW_1

5.2974

0.0072

CAU_T2HW_2

5.0278

0.0085

CAU_T2HW_3

5.2167

0.0078

CYM_T0_1

4.8018

0.0083

CYM_T0_2

4.9936

0.0098

CYM_T0_3

4.9272

0.0081

CYM_T1_1

4.9171

0.0052

CYM_T1_2

5.0564

0.0056

CYM_T1_3

5.2460

0.0065

CYM_T2C_1

5.3003

0.0063

CYM_T2C_2

5.0557

0.0063

CYM_T2C_3

4.8975

0.0066

CYM_T2HW_1

4.9353

0.0067

CYM_T2HW_2

5.1045

0.0064

CYM_T2HW_3

5.1299

0.0066

The next graphic depicts, for each sample, its Shannon index \(SH\) and the error bar \(SH\pm 2\cdot sd\).

row.names(Resultados)=Samples

plot(0.1*(1:24),Resultados$SH,
     col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2), 
     pch=c(rep(20,12),rep(18,12)),cex=1,
     xlab="",ylab="Shannon index",xaxt='n',
     main="All samples")
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
       pch=c(20,18,NA,NA,NA,NA),
       lty=c(NA,NA,1,1,1,1),
       legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
       seg.len=0.5,cex=0.5)
segments(0.1*(1:24),Resultados$SH-2*Resultados$sd,
         0.1*(1:24),Resultados$SH+2*Resultados$sd,
         col=rep(c(rep("green",3),rep("blue",3),rep("purple",3),rep("red",3)),2),lwd=1.5)

We repeat the process for sample types: for each sample type, we merge all three samples of that type into a single sample. We include the intervals (SH-3·sd,SH+3·sd). The reason for the factor 3 is that there are 36 pairs of samples to compare, and hence we use as factor for sd the 0.951/36 quantile of the standard normal distribution. In this way, disjoint intervals give statistical evidence (at the 5% signification level) that the corresponding two indices are different.

Resultados.agrupados=data.frame(levels(Tipos),
round(apply(DF.0.agrup,MARGIN=1,SH),4),
round(apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)-3*apply(DF.0.agrup,MARGIN=1,sd.SH),4),
round(apply(DF.0.agrup,MARGIN=1,SH)+3*apply(DF.0.agrup,MARGIN=1,sd.SH),4)
)
row.names(Resultados.agrupados)=NULL
names(Resultados.agrupados)=c("Samples","SH","sd","SH-3sd","SH+3sd")

flextable(Resultados.agrupados)|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Samples

SH

sd

SH-3sd

SH+3sd

CAU_T0

5.2009

0.0056

5.1842

5.2176

CAU_T1

5.2607

0.0059

5.2428

5.2785

CAU_T2C

5.2590

0.0060

5.2410

5.2770

CAU_T2HW

5.2457

0.0046

5.2319

5.2594

CYM_T0

4.9321

0.0051

4.9168

4.9474

CYM_T1

5.1190

0.0034

5.1089

5.1291

CYM_T2C

5.1226

0.0038

5.1113

5.1338

CYM_T2HW

5.0875

0.0039

5.0759

5.0991

plot(0.1*(1:8),Resultados.agrupados$SH,col=rep(c("green","blue","purple","red"),2),
     pch=c(rep(20,4),rep(18,4)),
     xlab="",ylab="Shannon index",xaxt='n',
     main="All sample types")
segments(0.1*(1:8),Resultados.agrupados$SH-3*Resultados.agrupados$sd,
         0.1*(1:8),Resultados.agrupados$SH+3*Resultados.agrupados$sd,
         col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("bottomleft",col=c("black","black","green","blue","purple","red"),
       pch=c(20,18,NA,NA,NA,NA),
       lty=c(NA,NA,1,1,1,1),
      legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
       seg.len=0.5,cex=0.5)

SH indices of CAU samples are significantly different from (and actually significantly larger than) those of CYM samples.

plot(0.1*(1:4),Resultados.agrupados$SH[1:4],col=c("green","blue","purple","red"),
     pch=20,main="CAU sample types",
     xlab="",ylab="Shannon index",xaxt='n',
     ylim=c(min(Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4]),
              max(Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4])))
segments(0.1*(1:4),Resultados.agrupados$SH[1:4]-3*Resultados.agrupados$sd[1:4],
         0.1*(1:4),Resultados.agrupados$SH[1:4]+3*Resultados.agrupados$sd[1:4],
         col=c("green","blue","purple","red"),lwd=1.5)
legend("topright",col=c("green","blue","purple","red"),
       pch=20,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)

Only the SH index of T0 samples seems to be significantly different from the others.

plot(0.1*(1:4),Resultados.agrupados$SH[5:8],col=c("green","blue","purple","red"),
     pch=18,main="CYM sample types",
     xlab="",ylab="Shannon index",xaxt='n',          ylim=c(min(Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8]),max(Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8])))
segments(0.1*(1:4),Resultados.agrupados$SH[5:8]-3*Resultados.agrupados$sd[5:8],
         0.1*(1:4),Resultados.agrupados$SH[5:8]+3*Resultados.agrupados$sd[5:8],
         col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
       pch=18,legend=c("T0","T1","T2C","T2HW"),seg.len=0.5,cex=0.5)

All four SH indices are significantly different, except for T1 and T2C.

We can use Hutcheson t-test to decide whether Shannon indices are significantly different. We perform pairwise comparisons and adjust the p-values using the Holm method. The conclusions are the same.

pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
  for (j in (i+1):8){
   pvalSH.G[i,j]= Hutcheson_t_test(DF.0.agrup[i,],DF.0.agrup[j,])$p.value
  }
}
pvalSH.G=matrix(p.adjust(pvalSH.G,method="holm"),nrow=8)

colnames(pvalSH.G)=levels(Tipos)
row.names(pvalSH.G)=levels(Tipos)

flextable(data.frame(round(pvalSH.G,6)))|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

CAU_T0

CAU_T1

CAU_T2C

CAU_T2HW

CYM_T0

CYM_T1

CYM_T2C

CYM_T2HW

0

0.000000

0.000000

0

0

0.000000

0

0.950346

0.179925

0

0

0.000000

0

0.228577

0

0

0.000000

0

0

0

0.000000

0

0

0.000000

0

0.950346

0

0

4.2 Behaviour of Shannon indices under thresholds

We address the following question: If we set a threshold \(p_0\), we only retain from each sample the OTUs that appear with a relative frequency \(\geqslant p_0\) and calculate the Shannon index (SH) of those samples, how does its longitudinal behavior vary from T0->T1->T2C->T2HW as \(p_0\) increases?

We use the following method to answer this question:

  • We consider \(p_0\) thresholds from 0.00005 (0.005%) to 0.05 (5%), advancing in steps of 0.00005.
  • For each threshold, in each sample, we retain the OTUs whose relative frequency is \(\geqslant p_0\)
  • Then, we group the samples by sample type and compute the SH of the grouped samples.
  • We visually analyze the trend in SH among CAU and CYM samples as \(p_0\) increases.
  • We also compute, for each type of sample and for relevant thresholds, the proportion of OTUs present in at least one sample of that type whose relative frequency in all three samples of that type is greater than or equal to the threshold.

We obtain that, in the CAU samples, at some point between the 0.15% and 0.2% thresholds, and in the CYM samples, at some point between the 0.1% and 0.15% thresholds, the longitudinal behavior of the SH changes for the first time from that observed in the whole samples, shifting from a rise-fall-fall pattern to a rise-fall-rise pattern.

For each type of CAU sample, around 12-13% of OTUs present in at least one sample of this type have relative frequency in all three samples of that type greater than or equal to 0.1%. For CYM sample types, this figure descends to 9-10%.

# threshold
llindar=function(x,l){
if (x>=l){return(1)}
  else{return(0)}
}
#Global sample
DF.0=as.matrix(DF.0)
DF.0.prop=DF.0/rowSums(DF.0)
DF.0.agrup=as.matrix(DF.0.agrup)
DF.0.agrup.prop=DF.0.agrup/rowSums(DF.0.agrup)

Original sample sizes and range of proportions within them:

Tamaños=rowSums(DF.0)
Props.min=round(apply(DF.0.prop,MARGIN=1,function(x){min(x[x>0])}),6)
Props.max=apply(DF.0.prop,MARGIN=1,function(x){max(x[x>0])})
Info=data.frame(Tamaños,Props.min,Props.max)
names(Info)=c("Size","Smallest prop.","Largest prop.")

flextable(Info)|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Size

Smallest prop.

Largest prop.

32452

0.000031

0.05805497

45161

0.000022

0.05832466

26298

0.000038

0.05354019

31355

0.000032

0.05243183

30592

0.000033

0.05458944

29368

0.000034

0.07147235

30554

0.000033

0.05194083

33119

0.000030

0.05694616

26568

0.000038

0.06884222

62616

0.000016

0.06188514

43726

0.000023

0.06026163

50715

0.000020

0.05223307

57062

0.000018

0.10017174

38684

0.000026

0.09365629

56482

0.000018

0.09686272

145523

0.000007

0.11633900

122687

0.000008

0.08515980

82489

0.000012

0.08112597

91168

0.000011

0.06932257

97111

0.000010

0.09009278

97125

0.000010

0.12887516

88157

0.000011

0.14189457

94560

0.000011

0.11748096

89922

0.000011

0.11695692

4.2.1 Original sample

Resultados.Agrupados=data.frame(
  SH=apply(DF.0.agrup,MARGIN=1,SH),
  Varianza=apply(DF.0.agrup,MARGIN=1,Var.SH))
##
plot(0.1*(1:8),Resultados.Agrupados$SH,col=rep(c("green","blue","purple","red"),2),
     pch=c(rep(20,4),rep(18,4)),
     xlab="",ylab="SH",xaxt='n',
     main="Grouped samples")
segments(0.1*(1:8),Resultados.Agrupados$SH-3*sqrt(Resultados.Agrupados$Varianza),
         0.1*(1:8),Resultados.Agrupados$SH+3*sqrt(Resultados.Agrupados$Varianza),
         col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
       pch=c(20,18,NA,NA,NA,NA),
       lty=c(NA,NA,1,1,1,1),
      legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
       seg.len=0.5,cex=0.5)

#
plot(0.1*(1:4),Resultados.Agrupados$SH[1:4],col=c("green","blue","purple","red"),
     pch=20,main="Grouped samples: CAU",
     xlab="",ylab="SH",xaxt='n',
     ylim=c(min(Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4])),
              max(Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[1:4]-3*sqrt(Resultados.Agrupados$Varianza[1:4]),
         0.1*(1:4),Resultados.Agrupados$SH[1:4]+3*sqrt(Resultados.Agrupados$Varianza[1:4]),
         col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
       pch=20,legend=c("T0","T1","T2C","T2HW"),cex=0.5)

#
plot(0.1*(1:4),Resultados.Agrupados$SH[5:8],col=c("green","blue","purple","red"),
     pch=18,main="Grouped samples: CYM",
     xlab="",ylab="SH",xaxt='n',          ylim=c(min(Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8])),max(Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]))))
segments(0.1*(1:4),Resultados.Agrupados$SH[5:8]-3*sqrt(Resultados.Agrupados$Varianza[5:8]),
         0.1*(1:4),Resultados.Agrupados$SH[5:8]+3*sqrt(Resultados.Agrupados$Varianza[5:8]),
         col=c("green","blue","purple","red"),lwd=1.5)
legend("topleft",col=c("green","blue","purple","red"),
       pch=18,legend=c("T0","T1","T2C","T2HW"),cex=0.5)

4.2.2 Enter thresholds

Resultados=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]
Resultados=rbind(Resultados,
       c(apply(DD.agrup,MARGIN=1,SH),#Shannon
       apply(DD.agrup,MARGIN=1,sd.SH),#Variança
       apply(DD.agrup,MARGIN=1,SH)-3*apply(DD.agrup,MARGIN=1,sd.SH), #LE de l'IC
       apply(DD.agrup,MARGIN=1,SH)+3*apply(DD.agrup,MARGIN=1,sd.SH),#UE  de l'IC
       
       i0 #llindar
       ))
}


Resultados=data.frame(Resultados)

names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
      paste("sd",levels(Tipos),sep="."),
      paste("LE",levels(Tipos),sep="."),
      paste("UE",levels(Tipos),sep="."),
      "threshold")

saveRDS(Resultados, file="Resultados.Shannon.Umbrales.RData")
p.valores=c()
for (i in 1:1000){
print(i)
i0=i*0.00005
llindar.0=function(x){llindar(x,i0)}
DD=vapply(DF.0.prop, llindar.0, numeric(1))*DF.0
DD.agrup=aggregate(DD,by=list(Tipos),FUN=sum)[,-1]


pvalSH.G=matrix(NA,nrow=8,ncol=8)
for (i in 1:7){
  for (j in (i+1):8){
   pvalSH.G[i,j]= Hutcheson_t_test(DD.agrup[i,],DD.agrup[j,])$p.value
  }
}
pvalSH.G=p.adjust(pvalSH.G,method="holm")

p.valores=rbind(p.valores,
                c(i0,
                  pvalSH.G))

}

write.csv(p.valores, file="p.valores.Shannon.Umbrales.csv",row.names=FALSE)

The results are in the file “Resultados.Shannon.Umbrales.RData”. Its variables are (for each type of sample: CAU_T0, CAU_T1 etc.):

  • SH.type: The Shannon index (SH) of the union of the three samples of that type

  • sd.type: The estimated standard deviation of the SH of the union of the three samples of that type

  • LE.type: SH minus 3 times the estimated standard deviation

  • UE.type: SH plus 3 times the estimated standard deviation

  • Threshold: The corresponding threshold

We show its first 10 rows (rounded, and splitted into different tables, for horizontal space reasons), corresponding to thresholds from 0.00005 to 0.0005. :

Resultados=readRDS("Resultados.Shannon.Umbrales.RData")
names(Resultados)=c(paste("SH",levels(Tipos),sep="."),
      paste("sd",levels(Tipos),sep="."),
      paste("LE",levels(Tipos),sep="."),
      paste("UE",levels(Tipos),sep="."),
      "Threshold")

Resultados.r4=Resultados
Resultados.r4[,1:32]=round(Resultados.r4[,1:32],4)

flextable(Resultados.r4[1:10,c(33,1+8*(0:3),2+8*(0:3))])|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Threshold

SH.CAU_T0

sd.CAU_T0

LE.CAU_T0

UE.CAU_T0

SH.CAU_T1

sd.CAU_T1

LE.CAU_T1

UE.CAU_T1

0.00005

5.1252

0.0053

5.1092

5.1412

5.1997

0.0058

5.1824

5.2170

0.00010

5.0481

0.0051

5.0326

5.0635

5.1092

0.0055

5.0926

5.1257

0.00015

5.0011

0.0050

4.9859

5.0162

5.0479

0.0054

5.0318

5.0641

0.00020

4.9245

0.0049

4.9098

4.9391

4.9787

0.0052

4.9630

4.9944

0.00025

4.8751

0.0048

4.8607

4.8895

4.9337

0.0052

4.9182

4.9491

0.00030

4.8395

0.0047

4.8253

4.8537

4.8735

0.0050

4.8584

4.8887

0.00035

4.7807

0.0046

4.7668

4.7946

4.8427

0.0050

4.8277

4.8577

0.00040

4.7456

0.0046

4.7319

4.7594

4.8042

0.0049

4.7893

4.8190

0.00045

4.7104

0.0045

4.6968

4.7239

4.7588

0.0049

4.7442

4.7733

0.00050

4.6612

0.0044

4.6479

4.6745

4.7291

0.0048

4.7146

4.7435

flextable(Resultados.r4[1:10,c(33,3+8*(0:3),4+8*(0:3))])|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Threshold

SH.CAU_T2C

sd.CAU_T2C

LE.CAU_T2C

UE.CAU_T2C

SH.CAU_T2HW

sd.CAU_T2HW

LE.CAU_T2HW

UE.CAU_T2HW

0.00005

5.1943

0.0058

5.1769

5.2117

5.1539

0.0043

5.1408

5.1669

0.00010

5.1027

0.0056

5.0860

5.1194

5.0714

0.0042

5.0588

5.0839

0.00015

5.0536

0.0055

5.0372

5.0699

5.0186

0.0041

5.0063

5.0309

0.00020

4.9697

0.0053

4.9539

4.9855

4.9529

0.0040

4.9409

4.9649

0.00025

4.9234

0.0052

4.9078

4.9389

4.9069

0.0039

4.8952

4.9187

0.00030

4.8896

0.0051

4.8743

4.9050

4.8537

0.0038

4.8422

4.8653

0.00035

4.8425

0.0050

4.8274

4.8576

4.8159

0.0038

4.8045

4.8274

0.00040

4.8079

0.0050

4.7929

4.8228

4.7660

0.0037

4.7548

4.7772

0.00045

4.7851

0.0050

4.7702

4.7999

4.7365

0.0037

4.7254

4.7476

0.00050

4.7406

0.0049

4.7259

4.7553

4.7040

0.0037

4.6930

4.7150

flextable(Resultados.r4[1:10,c(33,5+8*(0:3),6+8*(0:3))])|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Threshold

SH.CYM_T0

sd.CYM_T0

LE.CYM_T0

UE.CYM_T0

SH.CYM_T1

sd.CYM_T1

LE.CYM_T1

UE.CYM_T1

0.00005

4.8502

0.0049

4.8355

4.8649

5.0004

0.0032

4.9909

5.0100

0.00010

4.7581

0.0047

4.7440

4.7722

4.9137

0.0031

4.9045

4.9229

0.00015

4.6853

0.0046

4.6717

4.6990

4.8593

0.0030

4.8503

4.8684

0.00020

4.6284

0.0045

4.6149

4.6418

4.7987

0.0030

4.7898

4.8075

0.00025

4.5794

0.0044

4.5662

4.5927

4.7475

0.0029

4.7387

4.7562

0.00030

4.5353

0.0044

4.5222

4.5483

4.7035

0.0029

4.6948

4.7121

0.00035

4.5003

0.0043

4.4874

4.5132

4.6659

0.0029

4.6573

4.6745

0.00040

4.4627

0.0043

4.4499

4.4754

4.6107

0.0028

4.6023

4.6191

0.00045

4.4162

0.0042

4.4036

4.4288

4.5693

0.0028

4.5609

4.5776

0.00050

4.3812

0.0042

4.3687

4.3937

4.5383

0.0028

4.5300

4.5466

flextable(Resultados.r4[1:10,c(33,7+8*(0:3),8+8*(0:3))])|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Threshold

SH.CYM_T2C

sd.CYM_T2C

LE.CYM_T2C

UE.CYM_T2C

SH.CYM_T2HW

sd.CYM_T2HW

LE.CYM_T2HW

UE.CYM_T2HW

0.00005

5.0126

0.0036

5.0019

5.0233

4.9765

0.0037

4.9655

4.9875

0.00010

4.9095

0.0034

4.8992

4.9197

4.8922

0.0036

4.8816

4.9029

0.00015

4.8446

0.0033

4.8346

4.8546

4.8148

0.0035

4.8044

4.8252

0.00020

4.7855

0.0033

4.7756

4.7953

4.7597

0.0034

4.7494

4.7699

0.00025

4.7292

0.0032

4.7195

4.7388

4.7049

0.0034

4.6948

4.7150

0.00030

4.6798

0.0032

4.6702

4.6893

4.6562

0.0033

4.6462

4.6661

0.00035

4.6419

0.0032

4.6324

4.6513

4.6095

0.0033

4.5997

4.6193

0.00040

4.5895

0.0031

4.5802

4.5988

4.5639

0.0032

4.5542

4.5736

0.00045

4.5482

0.0031

4.5390

4.5575

4.5322

0.0032

4.5226

4.5418

0.00050

4.5203

0.0030

4.5111

4.5294

4.5011

0.0032

4.4915

4.5106

We start by checking several thresholds:

  • 0.0001: OTUs that represent more than 0.01% in their samples
  • 0.001: OTUs that represent more than 0.1% in their samples
  • 0.0025: OTUs that represent more than 0.25% in their samples
  • 0.005: OTUs that represent more than 0.5% in their samples
  • 0.0075: OTUs that represent more than 0.75% in their samples
  • 0.01: OTUs that represent more than 1% in their samples
  • 0.025: OTUs that represent more than 2.5% in their samples
Dibu.global=function(p){
  Tall=as.matrix(Resultados[Resultados$Threshold==p,])
plot(0.1*(1:8),Tall[1,1:8],
     col=rep(c("green","blue","purple","red"),2),
     pch=c(rep(20,4),rep(18,4)),
     xlab="",ylab="SH",xaxt='n',main=paste("Threshold",p,sep=" "))
segments(x0=0.1*(1:8),y0=Tall[1,17:24],
         x1=0.1*(1:8),y1=Tall[1,25:32],
         col=rep(c("green","blue","purple","red"),2),lwd=1.5)
legend("topright",col=c("black","black","green","blue","purple","red"),
       pch=c(20,18,NA,NA,NA,NA),
       lty=c(NA,NA,1,1,1,1),
      legend=c("CAU","CYM","T0","T1","T2C","T2HW"),
       seg.len=0.5,cex=0.5)
}
Dibu.global(0.0001)

Dibu.global(0.001)

Dibu.global(0.0025)

Dibu.global(0.005)

Dibu.global("0.0075")

Dibu.global(0.01)

Dibu.global(0.025)

As it can be seen, the longitudinal behavior of Shannon indices changes from p0=0.1% (which is similar to the global behaviour) to 0.25%.

Let us see the graphics for thresholds between 0.001 and 0.005:

for (i in 0:8){
Dibu.global(0.001+0.0005*i)
}

4.2.3 How many OTUs are we talking about?

For each type of samples, the next graph represents the proportion of OTUs present in at least one sample of this type whose relative frequency in all three samples of that type is greater than or equal to the percentage indicated on the x-axis. The proportions are also given in a table.

  n=seq(from=0.001,to=0.003,by=0.0005)
  N=length(n)
Result=n
for (i in 1:8){
 X1=DF.0.prop[3*(i-1)+1,]
 X1=sort(X1[X1!=0])
 X2=DF.0.prop[3*(i-1)+2,]
 X2=sort(X2[X2!=0])
 X3=DF.0.prop[3*(i-1)+3,]
 X3=sort(X3[X3!=0])

  PP=rep(0,N)
  for(j in 1:N){PP[j]=length(X1[X1>=n[j]] & X2[X2>=n[j]] & X3[X3>=n[j]])/length(X1[X1>0] | X2[X2>0] | X3[X3>0])}
  plot(100*n,PP,pch=20,type="l",lwd=1.5,xaxp=c(0.1,0.3,N),yaxp=c(0,0.2,20),xlab="%",ylab="Proportion of OTUs whose percentage is higher than ...",main=Tipos[i])
  abline(v=0.01*(0:100),lwd=0.5,col="red")
  abline(h=0.01*(0:100),lwd=0.5,col="red")
  abline(v=0.15,lwd=1,col="blue")
 Result=cbind(Result,round(PP,4)) 
}

Result=data.frame(Result)
names(Result)=c("Threshold",levels(Tipos))
row.names(Result)=NULL


flextable(Result)|>
   fontsize(size = 8, part = "all")|>
 width(width = 0.9) 

Threshold

CAU_T0

CAU_T1

CAU_T2C

CAU_T2HW

CYM_T0

CYM_T1

CYM_T2C

CYM_T2HW

0.0010

0.1269

0.1283

0.1394

0.1162

0.1027

0.0896

0.0922

0.0901

0.0015

0.0968

0.0929

0.1004

0.0829

0.0759

0.0662

0.0643

0.0683

0.0020

0.0751

0.0765

0.0797

0.0652

0.0610

0.0508

0.0539

0.0550

0.0025

0.0609

0.0650

0.0632

0.0546

0.0521

0.0388

0.0446

0.0459

0.0030

0.0509

0.0526

0.0563

0.0446

0.0417

0.0342

0.0348

0.0363

5 Samples divergence analysis

According to the Bray-Curtis distance (correctly applied to compositions), the CAU and CYM samples at T0 are more different than at T1, more similar at T1 than at T2C, and again more similar at T2HW than at T2C (or than at T1 or T0, for that matter). Overall, the global sets of CAU and CYM samples are more different than the starting T0 samples, as if part of the original differences were diluted. Are these differences statistically significant?

Well, we have devised a method to assess the statistical significance of the differences in median distances between two pairs of samples groups, based on a resampling procedure. Using this method, we obtain that these convergences and divergences from T0 to T1, from T1 to T2C, from T2C to T2HW, and from T0 to the global sample, are statistically significant.

DF.0.agrup=aggregate(DF.0,by=list(Tipos),FUN=sum)[,-1]

The next table shows, for each level T0, T1, T2C, T2HW, and Global (all samples), the median Bray-Curtis (BC) distances between the compositions of the CAU and CYM samples at those levels.

Times

Average BC-dist.

T0

0.4124

T1

0.4097

T2C

0.4493

T2HW

0.3766

Global

0.4189

The BC-distances decrease from T0 to T1. Then, from T1 to T2C they increase, while from T1 to T2HW they decrease further. Also, from T0 to Global it decreases. Are these “convergences” and “divergences” of samples statistically significant?

We want to perform a contrast with null hypothesis “The CAU_T1 and CYM_T1 samples are as similar as the CAU_T0 and CYM_T0 samples” and alternative hypothesis “The CAU_T1 and CYM_T1 samples are more similar than the CAU_T0 and CYM_T0 samples”.

To solve this contrast, we resort to a resampling process. The broad idea is:

  • On the one hand, we merge all 3 samples CAU_T0 in a single sample, and all 3 samples CYM_T0 in another single sample, and we repeat 1000 times the following:

    • We split the CAU merged sample into 3 disjoint random samples of the sizes of the original CAU_T0_1, CAU_T0_2, CAU_T0_3 samples forming it
    • We split the CYM merged sample into 3 disjoint random samples of the sizes of the original CYM_T0_1, CYM_T0_2, CYM_T0_3 samples forming it
    • We compute the median distance between the 3 CAU random samples and the 3 CYM random samples

    In this way, we obtain a vector of 1000 median distances between CAU and CYM for T0

  • On the other hand, we merge all 6 samples CAU_T0 and CAU_T1 in a single sample, and all 6 samples CYM_T0 and CYM_T1 in another single sample, and we repeat 1000 times the following:

    • We take from the CAU merged sample 3 disjoint random samples (without reposition) of the sizes of the original CAU_T0_1, CAU_T0_2, CAU_T0_3 samples
    • We take from the CYM merged sample into 3 disjoint random samples (without reposition) of the sizes of the original CYM_T0_1, CYM_T0_2, CYM_T0_3 samples
    • We compute the median distance between the 3 CAU random samples and the 3 CYM random samples

    In this way, we obtain a vector of 1000 median distances between CAU and CYM for the union of T0 and T1

The idea is that if the T1 samples are “as similar” as the T0 samples, the median distances between triples of disjoint random samples from the merged T0 samples should be similar to the median distances between triples of disjoint random samples (of the same sizes) from the merged T0+T1 samples. Conversely, if the T1 samples are “more similar” than the T0 samples, the median distances between triples of random samples from the merged T0 samples should tend to be larger than the median distances between triples of random samples (of the same sizes) from the merged T0+T1 samples.

So, we can use as p-value the proportion of pairs (median distance between CAU and CYM for T0, median distance between CAU and CYM for T0+T1) where the first entry is smaller than the second entry. If the T1 samples are “as similar” as the T0 samples, we would expect this to happen half of times, while if the T0 samples are more different than the T1 samples, we would expect this to happen with low frequency.

The problem is that the CYM samples are larger than CAU samples, and it can bias the samples; for instance if we merge CAU_T1 and CYM_T1 samples and take random samples, most of the taxa will come from CYM:

Samples

Sizes

CAU_T0_1

32452

CAU_T0_2

45161

CAU_T0_3

26298

CAU_T1_1

31355

CAU_T1_2

30592

CAU_T1_3

29368

CYM_T0_1

57062

CYM_T0_2

38684

CYM_T0_3

56482

CYM_T1_1

145523

CYM_T1_2

122687

CYM_T1_3

82489

To solve this drawback, and since we apply BC-distances to proportions, we reescale all samples to size 106 (up to rounding; we need integer frequencies for the simulations) before merging.

DF=round(DF.0.prop*10^6)
DF.prop=DF/rowSums(DF)
DF.agrup=aggregate(DF,by=list(Tipos),FUN=sum)[,-1]
Sizes=rowSums(DF)

The new sample sizes are

Samples

Sizes

CAU_T0_1

1000070

CAU_T0_2

999907

CAU_T0_3

999911

CAU_T1_1

1000070

CAU_T1_2

1000030

CAU_T1_3

999912

CYM_T0_1

1000234

CYM_T0_2

1000111

CYM_T0_3

1000043

CYM_T1_1

1000078

CYM_T1_2

999876

CYM_T1_3

999852

Let us check first that with these reescaled samples (where, due to rounding, the proportions are not exactly the original ones but very close to them), the behaviour of the distances is the same as the original one:

Times

Average BC-dist.

T0

0.4124

T1

0.4097

T2C

0.4493

T2HW

0.3766

Global

0.4189

No change.

We shall use the following function:

  • Function Simulation
    • takes the different samples CAU y CYM at the specified times, for instance, CAU_T0, CAU_T1, CYM_T0 and CYM_T1, or simply CAU_T0 and CYM_T0
    • considers as “base” samples the samples CAU and CYM at the first of each set of times, for instance CAU_T0 and CYM_T0
    • merges all samples CAU in one sample, and all samples CYM in another sample
    • it extracts, from the CAU merged sample, 3 disjoint random samples (without reposition) of the same sizes as the reescaled CAU base samples, and from the CYM merged sample, 3 disjoint random samples of the same sizes as the reescaled CYM base samples
    • it computes the median specified distance between the 3 CAU random samples and the 3 CYM random samples
#' II indexes of the CAU grouped samples
#' The first one is the "base" time
#' Example: CAU T0+T1 and CYM T0+T1 vs CAU T0 and CYM T0
#' II=c(1,2)
#' Example: CAU_T0 vs CYM_T0
#' II=c(1)
Simulation=function(II,distancia){
#CAU
XX1=c()
for (i in 1:length(II)){
  XX1=c(XX1,rep(OTUs,DF.agrup[II[i],]))
}
XX1=factor(XX1,levels=OTUs)
ind1=1:length(XX1)
#CYM
XX2=c()
for (i in 1:length(II)){
  XX2=c(XX2,rep(OTUs,DF.agrup[4+II[i],]))
}
XX2=factor(XX2,levels=OTUs)
ind2=1:length(XX2)

# sizes of base samples
SS=Sizes[c(3*(II[1]-1)+1:3,3*((4+II[1])-1)+1:3)]


ind1.1=sample(ind1,SS[1],replace=FALSE)
ind1.2=sample(ind1[-ind1.1],SS[2],replace=FALSE)
ind1.3=sample(ind1[-c(ind1.1,ind1.2)],SS[3],replace=FALSE)

ind2.1=sample(ind2,SS[4],replace=FALSE)
ind2.2=sample(ind2[-ind2.1],SS[5],replace=FALSE)
ind2.3=sample(ind2[-c(ind2.1,ind1.2)],SS[6],replace=FALSE)

Y1.1=prop.table(table(factor(XX1[ind1.1],levels=OTUs)))
Y1.2=prop.table(table(factor(XX1[ind1.2],levels=OTUs)))
Y1.3=prop.table(table(factor(XX1[ind1.3],levels=OTUs)))
Y2.1=prop.table(table(factor(XX2[ind2.1],levels=OTUs)))
Y2.2=prop.table(table(factor(XX2[ind2.2],levels=OTUs)))
Y2.3=prop.table(table(factor(XX2[ind2.3],levels=OTUs)))

YY=rbind(Y1.1,Y1.2,Y1.3,Y2.1,Y2.2,Y2.3)

BC=c()
for (i in 1:3){
  for (j in 4:6){
    BC=c(BC,vegan::vegdist(YY[c(i,j),], method=distancia))
  }
}
median(BC)
}

For reproducibility, we fix a random seed

#initial_seed=as.integer(Sys.time())
#print(initial_seed)
## [1] 1725986963
#initial_seed%%10000
## [1] 6963

set.seed(6963)

Let us perform the desired contrast for T0 vs T1 and the BC-distance

Sim.T0T1.T0.BC=replicate(1000,Simulation(c(1,2),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))

saveRDS(Sim.T0T1.T0.BC, file="Sim.T0T1.T0.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")

The median BC-distances for the random T0+T1 samples go from 0.3877 to 0.3895, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. So, all median BC-distances for T0 are larger than any median distance for T0+T1. Moreover, recall that the median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124, larger than the largest median BC-distance for the random T0+T1 samples.

For robustness, let us perform the contrast with T1 as base samples

Sim.T0T1.T1.BC=replicate(1000,Simulation(c(2,1),"bray"))
Sim.T1.BC=replicate(1000,Simulation(c(2),"bray"))

saveRDS(Sim.T0T1.T1.BC, file="Sim.T0T1.T1.BC.RData")
saveRDS(Sim.T1.BC, file="Sim.T1.BC.RData")

In this case, the median BC-distances for the T0+T1 samples go from 0.3874 to 0.3894, while the median BC-distances for the T1 samples go from 0.3792 to 0.3807. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.

Now T1 vs T2C, taking T1 as base samples (for the sake of completeness, we repeat the simulation for T1).

Sim.T1T2C.T1.BC=replicate(1000,Simulation(c(2,3),"bray"))
Sim.T1.BC.2=replicate(1000,Simulation(c(2),"bray"))

saveRDS(Sim.T1T2C.T1.BC, file="Sim.T1T2C.T1.BC.RData")
saveRDS(Sim.T1.BC.2, file="Sim.T1.BC.2.RData")

The median BC-distances for the T1+T2C samples go from 0.3971 to 0.3994, while the median BC-distances for the T1 samples go from 0.3792 to 0.3806. The median BC-distance between the (reescaled) CAU_T1 and CYM_T1 samples was 0.4097.

And T2C vs T2HW, with T2C as base samples.

Sim.T2CT2HW.T2C.BC=replicate(1000,Simulation(c(3,4),"bray"))
Sim.T2C.BC=replicate(1000,Simulation(c(3),"bray"))

saveRDS(Sim.T2CT2HW.T2C.BC,file="Sim.T2CT2HW.T2C.BC.RData")
saveRDS(Sim.T2C.BC, file="Sim.T2C.BC.RData")

The median BC-distances for the T2C+T2HW samples go from 0.385 to 0.3869, while the median BC-distances for the T2C samples go from 0.4274 to 0.4288. The median BC-distance between the (reescaled) CAU_T2C and CYM_T2C samples was 0.4493

Finally, the global samples:

Sim.Global.BC=replicate(1000,Simulation(c(1,2,3,4),"bray"))
Sim.T0.BC=replicate(1000,Simulation(c(1),"bray"))

saveRDS(Sim.Global.BC, file="Sim.Global.BC.RData")
saveRDS(Sim.T0.BC, file="Sim.T0.BC.RData")

The median BC-distances bewteen CAU and CYM for the Global samples go from 0.3805 to 0.3828, while the median BC-distances for the T0 samples go from 0.4057 to 0.407. The median BC-distance between the (reescaled) CAU_T0 and CYM_T0 samples was 0.4124

6 Differential abundance analysis

For each pair of sample types considered, we proceed as follows:

  1. We load TablaOTUs.xlsx and retain only the relevant “global” samples (all, only CAU, or only CYM).

  2. We remove OTUs with 2 or fewer reads across the selected samples.

  3. We remove OTUs that appear in only one of the selected samples, in order to apply Bayesian-multiplicative zero imputation.

  4. We perform Bayesian-multiplicative zero imputation on the selected samples.

  5. We retain only the samples corresponding to the time points to be compared.

  6. We assess whether and how well the two types of samples are clearly separated in a hierarchical clustering.

  7. We study how well the two types of samples separate when only a random group of OTUs of fixed size is considered, for different sizes. This provides a reference against which we evaluate the significance of the separation observed with specific groups of OTUs.

    To evaluate how a group of taxa separates two groups of samples CAUx and CYMx, we use the index
    \[ \frac{\text{distance between CAU${}_x$ and CYM${}_x$}}{\sqrt{(\text{dist. within CAU${}_x$})^2 + (\text{dist. within CYM${}_x$})^2}} \]
    where the distances are calculated as the heights in the hierarchical tree obtained by clustering the samples with the selected taxa.

    In each one of these simulations, we fix the random seed as the last four digits of Sys.time() at the moment of execution.

  8. We search for groups of OTUs with significantly different abundances in the two sample types by applying several methods: significance according to ALDeX (adjusted p-values from the Kruskal–Wallis test); relevant log-ratios; bacterial signatures identified with coda_glmnet; and significance according to simper.

  9. When the different methods identify groups of OTUs that consistently separate the two sample types, we record them in a table Significant, and finally we export this table as a csv file. Its name is “OTU_Significativos_” followed by the pair of sample types.

6.1 CAU vs CYM

tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID 
n.OTUs.original=dim(tax_otu)[2]-2
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
DF.0=tax_otu[,-c(1,2)]
taxa.original=taxa
OTUs=paste("OTU",1:n.OTUs.original,sep="")
colnames(DF.0)=OTUs
DF.0.total=DF.0
colnames(DF.0.total)=OTUs
rownames(DF.0.total)=Samples
rm(tax_otu)

Type=as.factor(rep(c("CAU","CYM"),each=12))
colors=c("green","blue")[Type]
#' Green: CAU  
#' Blue: CYM  

The OTUs that appear with 2 or fewer hits in the total sample represent 15.8% of the total.

Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
# DF.0.hits: para cada OTU y cada muestra, 1 si OTU in Muestra y 0 si no
hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

OTUs that appear in only one sample but represent more than 0.05% of that sample:

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL


Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
CAU_T0_2 OTU966 55 0.0012183

We remove the OTUs that appear in only one samples and we perform Bayesian-multiplicative zero imputation.

DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]
row.names(DF.0)=Samples
DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples

rm(DF.0.hits)
rm(Miseria)

From 3088 initial taxa, we have reduced to 2542.

6.1.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)

separacio.global=Sep(HC)

The two sample types are perfectly separated from the start. The separation between groups is 2.47.

We observe below that the two sample types are so different that almost any subset of OTUs of reasonable size classifies them correctly. This will serve as a reference to compare how much improvement we gain with the “good” sets identified later.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711395644
initial_seed %% 10000
# [1] 5644
set.seed(5644)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:round((dim(DF)[2])/2))
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))
  }
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAUvsCYM.RData")
F.SS=readRDS("Fraccio.Separadors.CAUvsCYM.RData")

The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU and CYM (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We have used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU and CYM (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). This plot will serve as a reference later.

fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1250),xaxp=c(0,1250,25),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")

`

6.1.2 OTUs significant according to ALDeX

We always use the aldexCesc.clr version of the aldex.clr function (from the script funcionsCODAMETACIRCLE.R) to generate the simulated samples.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711411362
initial_seed %% 10000
# [1] 1362
set.seed(1362)
repl.kw.Cescv2.CaCy=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.CaCy=aldex.kw(repl.kw.Cescv2.CaCy, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.CaCy, file="aldex_kw_Cescv2_CaCy.RData")
p.valores_kw=readRDS("aldex_kw_Cescv2_CaCy.RData") 

Adjusted Kruskal–Wallis p-value < 0.05

sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1)

kw.005=which(p.valores_kw[,2]<0.05) 

There are 804 taxa with adjusted Kruskal–Wallis p-value < 0.05.
The separation between groups is 2.48.

Adjusted Kruskal–Wallis p-value < 0.01

sep.kw.001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.01,1)

kw.001=which(p.valores_kw[,2]<0.01) 

There are 605 taxa with adjusted Kruskal–Wallis p-value < 0.01.
The separation between groups is 2.35.

Adjusted Kruskal–Wallis p-value < 0.001

sep.kw.0001=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.001,1)

kw.0001=which(p.valores_kw[,2]<0.001) 

There are 408 taxa with adjusted Kruskal–Wallis p-value < 0.001.
The separation between groups is 2.89.

Indicador=function(x,k=n.OTUs.original){
  xx=rep(0,k)
  y=as.numeric(gsub("\\D+", "", OTUs[x]))
  xx[y]=1
  
  return(xx)
}
Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Aldex.kw.0.05=Indicador(kw.005),
  Aldex.kw.0.01=Indicador(kw.001),
  Aldex.kw.0.001=Indicador(kw.0001)
  )

6.1.3 Taxa with maximum relevant log-ratios

We plot the assock values (see the main text) and obtain its maximum.

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAUCYM.RData")
LRS=readRDS("LRS_CAUCYM.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAUCYM.RData")
assoc=readRDS("assoc_CAUCYM.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most logratio-relevant taxa."
     ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 103 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 6.58.

Significant=cbind(Significant,
                     Log_ratio.max=Indicador(impLR))

6.1.4 Taxa relevant for the bacterial signature (according to coda_glmnet)

We always apply the coda_glmnet function to the frequency matrix with zeros already imputed.

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.CAUCYM.RData")
coda.glmnet=readRDS("coda.glmnet.CAUCYM.RData")
coda.glmnet$`signature plot`

impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

SB.glmnet=c(length(coda.glmnet$taxa.num),Sep(HC))

There are 118 significant taxa.
The separation between groups is 4.55.

Significant=cbind(Significant,
glmnet_sign.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))

6.2 Simper

We always use simper from vegan on the (original) proportion matrix.

DF.prop=DF.0.total/rowSums(DF.0.total)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.rel.prop=SMP.prop$CAU_CYM$ord[which(SMP.prop$CAU_CYM$cusum<=0.7)]
OTUs.simper.rel.prop=sort(attr(SMP.prop$CAU_CYM$cusum[which(SMP.prop$CAU_CYM$cusum<=0.7)],"names"))

DF.Imp=DF[,OTUs.simper.rel.prop]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

simper.rel.props=c(length(indexos_SMP.rel.prop),Sep(HC))

There are 93 significant taxa. They perfectly separate the two sample types. The separation between groups is 2.88.

Indicador.Gl=function(x,k=n.OTUs.original){
  xx=rep(0,k)
 xx[x]=1
  return(xx)
}

Significant=cbind(Significant,
simper.props=Indicador.Gl(indexos_SMP.rel.prop))
Significant=cbind(Significant,
t(DF.prop))

6.2.1 Summary

write.csv2(Significant,"OTU_Significativos_CAUvsCYM_def.csv",row.names=FALSE)
Resultados=rbind(
sep.kw.0001,sep.kw.005,sep.kw.001,
LR.max,SB.glmnet,
simper.rel.props
)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)

Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.

Significant.capat=Significant[,c(1,2,6,7,8)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]

There are 41 such taxa. They are

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling()
OTUs taxa
OTU22 OTU22 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae
OTU30 OTU30 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium
OTU32 OTU32 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured.
OTU35 OTU35 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta
OTU38 OTU38 Unassigned.__.....__
OTU39 OTU39 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes
OTU42 OTU42 d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured.
OTU45 OTU45 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae..
OTU46 OTU46 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Ignavibacteriaceae.gIgnavibacterium.
OTU70 OTU70 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales...
OTU109 OTU109 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gLCP.80.
OTU133 OTU133 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans.
OTU141 OTU141 d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__0319.6G20.f__0319.6G20.g0319.6G20.
OTU213 OTU213 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea.
OTU218 OTU218 d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_organism
OTU223 OTU223 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__ST.12K33.g__ST.12K33.s__uncultured_Cytophagales
OTU229 OTU229 d__Bacteria.p__Spirochaetota.c__Leptospirae.o__Leptospirales.f__Leptospiraceae.g__RBG.16.49.21.s__uncultured_bacterium
OTU230 OTU230 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes
OTU253 OTU253 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta
OTU255 OTU255 d__Bacteria.p__Spirochaetota.c__V2072.189E03.o__V2072.189E03.f__V2072.189E03.g__V2072.189E03.s__uncultured_organism
OTU256 OTU256 d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_bacterium
OTU269 OTU269 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gCandidatus_Thiobios.
OTU451 OTU451 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gActibacter.
OTU498 OTU498 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_organism
OTU527 OTU527 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfotignum.s__uncultured_bacterium
OTU761 OTU761 d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group.
OTU811 OTU811 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_bacterium
OTU881 OTU881 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_gamma
OTU890 OTU890 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_Bacteroidetes
OTU1107 OTU1107 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__metagenome
OTU1329 OTU1329 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema.
OTU1390 OTU1390 d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__saltmarsh_clone
OTU1419 OTU1419 d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_bacterium
OTU1443 OTU1443 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.g__B2M28.s__uncultured_proteobacterium
OTU1461 OTU1461 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__Robiginitalea_myxolifaciens
OTU1532 OTU1532 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_organism
OTU1594 OTU1594 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.gPHOS.HE36.
OTU1701 OTU1701 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Methyloligellaceae.gMethyloceanibacter.
OTU1839 OTU1839 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Thioalkalispiraceae.g__Thioalkalispira.Sulfurivermis.s__uncultured_Thioalkalispiraceae
OTU2338 OTU2338 d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.gXenococcus_PCC.7305.
OTU2371 OTU2371 d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.gRoseburia.
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

The separation between groups is 7.86.

We update the plot with the separations:

intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c("Aldex.0.001","Aldex.0.05","Aldex.0.01",
"LR","glmnet", "simper","intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xaxp=c(0,1000,40),yaxp=c(0,8.5,17),ylim=c(0,8.5),xlim=c(0,1000))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)

Resultados %>%
  kbl() %>%
  kable_styling()
Size Separation
Aldex.0.001 408 2.889614
Aldex.0.05 804 2.481799
Aldex.0.01 605 2.354071
LR 103 6.579296
glmnet 118 4.550477
simper 93 2.881239
intersection 41 7.863271

6.3 CAU T0 vs CYM T0

tax_otu=data.frame(read_excel("TablaOTUs.xlsx"))
Samples=tax_otu$ID 
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")


DF.0=tax_otu[c(1:3,13:15),-c(1,2)]
row.names(DF.0)=Samples[c(1:3,13:15)]
colnames(DF.0)=OTUs

taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]

Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]

hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL


DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]

row.names(DF.0)=Samples[c(1:3,13:15)]

DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
row.names(DF)=Samples[c(1:3,13:15)]


Type=as.factor(c(rep("CAU_T0", 3),rep("CYM_T0", 3)))
colors=c("green","blue")[Type]

After removing OTUs with 2 or fewer reads across the T0 samples and those appearing in only one of these samples, 1472 taxa remain.

The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:

Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
CAU_T0_2 OTU966 55 0.0012208
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)

6.3.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,barplot=FALSE,colores=colors)

separacio.global=Sep(HC)

The two sample types are perfectly separated from the start. The separation between groups is 2.43.

We examine the proportion of OTU subsets of reasonable size that already classify them correctly.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717567904
initial_seed %% 10000
# [1] 7904
set.seed(7904)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:750)
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))
  }
colnames(F.SS)=c("n","proporción","sep. media","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.T0.RData")
F.SS=readRDS("Fraccio.Separadors.T0.RData")

The first plot shows, for each \(n\) up to half the total number of OTUs, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CYM_T0 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to half the total number of OTUs, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CYM_T0 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).

fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")

`

6.3.2 OTUs significant according to ALDeX

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717574148
initial_seed %% 10000
# [1] 4148
set.seed(4148)
repl.kw.Cescv2.T0=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex.kw.Cescv2.T0=aldex.kw(repl.kw.Cescv2.T0, verbose=FALSE)
#'
saveRDS(aldex.kw.Cescv2.T0, file="aldex_kw_Cescv2.T0.RData")

Taxa with statistically significant differences:

p.valores_kw=readRDS("aldex_kw_Cescv2.T0.RData") 
length(p.valores_kw[p.valores_kw[,2]<0.05,2]) #K-W adjusted p-value < 0.05
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,2]) #p-value < 0.05
## [1] 121
length(p.valores_kw[p.valores_kw[,1]<0.01,4]) #p-value < 0.01
## [1] 0

There is no OTU with an adjusted Kruskal–Wallis test p-value < 0.05. There are 121 with an unadjusted p-value < 0.05 (and none < 0.01). They classify the samples correctly:

sep.kw.005=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)

kw.005=which(p.valores_kw[,2]<0.001) 

The separation between groups is 3.43.

Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Aldex.kw.0.05=Indicador(kw.005)
)

6.3.3 Taxa with maximum relevant log-ratios

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_T0.RData")
LRS=readRDS("LRS_T0.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_T0.RData")
assoc=readRDS("assoc_T0.RData")
plot(5:200,assoc[1:196],type="b",xlab="m",ylab="Bondad mediana de los m más relevantes"
     ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))
DF.Imp=DF[,impLR]
HC=ClusterHC(DF.Imp,dendrograma=FALSE,barplot=FALSE, Grups=Type)
LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 549 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.83.

Note In the plot we can observe a very pronounced local maximum at 46. Let us examine it:

m0=4+which.max(assoc[1:100])
impLR.2=sort(as.numeric(LRS$`order of importance`[1:m0]))
DF.Imp.0=DF[,impLR.2]
HC=ClusterHC(DF.Imp.0,dendrograma=TRUE,barplot=FALSE, Grups=Type)

LR.max.2=c(length(impLR.2),Sep(HC))

They also perfectly separate the two sample types. The separation between groups is 4.66. Fewer taxa, but better separation.

Significant=cbind(Significant,
                     Log_ratio.max=Indicador(impLR),
                     Log_ratio.max2=Indicador(impLR.2))

6.3.4 Taxa relevant for the bacterial signature (according to coda_glmnet)

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
saveRDS(coda.glmnet, file="coda.glmnet.T0.RData")
coda.glmnet=readRDS("coda.glmnet.T0.RData")
coda.glmnet$`signature plot`

impGLMNET=coda.glmnet$taxa.num
DF.Imp=DF[,impGLMNET]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.no0=c(length(coda.glmnet$taxa.num),Sep(HC))

There are 350 significant taxa.
The separation between groups is 2.78.

Significant=cbind(Significant,
glmnet_sign.no0.CAU=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`>0]),
glmnet_sign.no0.CYM=Indicador(coda.glmnet$taxa.num[coda.glmnet$`log-contrast coefficients`<0]))

Out of curiosity, we use 3 replicates to see what happens.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717687628
initial_seed %% 10000
# [1] 7628
M=3
set.seed(7628)
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(
  repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,
    repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3
  ))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(
  paste("CAU_T0_1",1:M,sep="_"),
  paste("CAU_T0_2",1:M,sep="_"),
  paste("CAU_T0_3",1:M,sep="_"),
  paste("CYM_T0_1",1:M,sep="_"),
  paste("CYM_T0_2",1:M,sep="_"),
  paste("CYM_T0_3",1:M,sep="_")
  )
mostres_ext=rbind(repls,easyCODA::CLR(DF)$LR)
conds_ext=as.factor(c(rep("CAU", 3*M),rep("CYM", 3*M),rep("CAU", 3),rep("CYM", 3)))
colors_ext=c("green","blue")[conds_ext]
rm(repls)
LogRat_mostres_ext_T0=logratios_matrix_clr(mostres_ext)
saveRDS(LogRat_mostres_ext_T0, file="LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=readRDS("LogRat_mostres_ext_T0.RData")
LogRat_mostres_ext_T0=list(LogRat_mostres_ext_T0[[1]],LogRat_mostres_ext_T0[[2]])

coda.glmnet_ext=coda_glmnet_lr_bin(LogRat_mostres_ext_T0,conds_ext,taxa,showPlots = FALSE)

coda.glmnet_ext$`signature plot`

impGLMNET_ext=coda.glmnet_ext$taxa.num
DF.Imp=DF[,impGLMNET_ext]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext=c(length(impGLMNET_ext),Sep(HC))

There are 104 significant taxa.
The separation between groups is 4.42.

Significant=cbind(Significant,
glmnet_sign.ext.CAU=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`>0]),
glmnet_sign.ext.CYM=Indicador(coda.glmnet_ext$taxa.num[coda.glmnet_ext$`log-contrast coefficients`<0]))

6.4 Simper

DF.prop=DF.0.total[c(1:3,13:15),]/rowSums(DF.0.total[c(1:3,13:15),])
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]
length(indexos_SMP.07)
## [1] 0
length(indexos_SMP.05)
## [1] 0

We find no significant taxon.

6.4.1 Summary

write.csv2(Significant,"OTU_Significativos_CAUT0vsCYMT0.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
LR.max.2,
sep.kw.005,
glmnet.no0,
glmnet.ext)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext"
)
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,750),xaxp=c(0,750,15),ylim=c(0,7),yaxp=c(0,7,28))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=separacio.global,col="green")

points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.1,col="blue",labels=row.names(Resultados),cex=0.75)
text(20,separacio.global+0.01,col="green",labels="Global",cex=0.75)

Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet.

Significant.capat=Significant[,c(1,2,4,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

There are 312 taxa. They are:

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
1 OTU1 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.guncultured.
2 OTU2 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Actibacter.s__uncultured_Bacteroidetes
3 OTU3 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__uncultured.s__uncultured_delta
4 OTU4 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__bacterium_AMSU
7 OTU7 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria....
8 OTU8 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__uncultured.s__uncultured_organism
9 OTU9 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSva0081_sediment_group.
10 OTU10 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.guncultured.
12 OTU12 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Bacteroidetes_BD2.2.gBacteroidetes_BD2.2.
13 OTU13 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.guncultured.
15 OTU15 d__Bacteria.p__Firmicutes.c__Clostridia.o__Clostridia.f__Hungateiclostridiaceae.g__uncultured.s__uncultured_bacterium
16 OTU16 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae..
18 OTU18 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae.g__uncultured.s__uncultured_sediment
19 OTU19 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.guncultured.
20 OTU20 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Robiginitalea.s__uncultured_bacterium
22 OTU22 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatitalea.s__uncultured_Desulfobacteraceae
23 OTU23 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.gLentimicrobiaceae.
24 OTU24 d__Bacteria.p__Bacteroidota.c__Bacteroidia.oBacteroidales...
25 OTU25 d__Archaea.p__Asgardarchaeota.c__Heimdallarchaeia.o__Heimdallarchaeia.f__Heimdallarchaeia.g__Heimdallarchaeia.s__uncultured_archaeon
26 OTU26 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__uncultured_organism
28 OTU28 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_sediment
29 OTU29 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Milano.WF1B.44.f__Milano.WF1B.44.g__Milano.WF1B.44.s__uncultured_gamma
30 OTU30 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__PHOS.HE36.g__PHOS.HE36.s__uncultured_bacterium
31 OTU31 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oEctothiorhodospirales...
32 OTU32 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.guncultured.
33 OTU33 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidetes_VC2.1_Bac22.f__Bacteroidetes_VC2.1_Bac22.g__Bacteroidetes_VC2.1_Bac22.s__uncultured_Bacteroidetes
35 OTU35 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__LCP.80.s__uncultured_delta
36 OTU36 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.gWoeseia.
37 OTU37 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__KI89A_clade.f__KI89A_clade.gKI89A_clade.
38 OTU38 Unassigned.__.....__
39 OTU39 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__Lentimicrobiaceae.g__Lentimicrobiaceae.s__uncultured_Bacteroidetes
40 OTU40 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Pelolinea.s__uncultured_bacterium
41 OTU41 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__B2M28.f__B2M28.gB2M28.
42 OTU42 d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.guncultured.
43 OTU43 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thioalkalivibrio
44 OTU44 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gSEEP.SRB1.
45 OTU45 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae..
47 OTU47 d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__SCGC_AAA011.D5.g__SCGC_AAA011.D5.s__uncultured_euryarchaeote
49 OTU49 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.g__Candidatus_Thiodiazotropha.s__Candidatus_Thiodiazotropha
50 OTU50 d__Bacteria.p__Sva0485.c__Sva0485.o__Sva0485.f__Sva0485.gSva0485.
52 OTU52 d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.guncultured.
53 OTU53 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__uncultured.f__uncultured.guncultured.
54 OTU54 d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Ilumatobacteraceae.gIlumatobacter.
55 OTU55 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_bacterium
56 OTU56 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Arenicellales.f__Arenicellaceae.guncultured.
57 OTU57 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gEudoraea.
60 OTU60 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_bacterium
62 OTU62 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae..
64 OTU64 d__Eukaryota......
69 OTU69 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae..
70 OTU70 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.oDesulfobacterales...
71 OTU71 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gRobiginitalea.
72 OTU72 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calorithrix.s__uncultured_bacterium
73 OTU73 d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__BSV26.gBSV26.
77 OTU77 d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Thermoplasmatales
79 OTU79 d__Bacteria.p__Acidobacteriota.c__Subgroup_21.o__Subgroup_21.f__Subgroup_21.g__Subgroup_21.s__uncultured_bacterium
80 OTU80 d__Bacteria......
81 OTU81 d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_bacterium
82 OTU82 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_delta
83 OTU83 d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..gSAR324_clade.Marine_group_B..
84 OTU84 d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_bacterium
85 OTU85 d__Bacteria.pChloroflexi.....
88 OTU88 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSpirochaeta_2.
94 OTU94 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta.s__uncultured_bacterium
95 OTU95 d__Bacteria.p__Cloacimonadota.c__Cloacimonadia.o__Cloacimonadales.f__MSBL8.g__MSBL8.s__uncultured_bacterium
98 OTU98 d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.gAcetothermiia.
102 OTU102 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_Chlorobi
104 OTU104 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae..
105 OTU105 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.gHalochromatium.
108 OTU108 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gGWE2.31.10.
111 OTU111 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Ardenticatenales.f__uncultured.guncultured.
113 OTU113 d__Archaea.p__Altiarchaeota.c__Altiarchaeia.o__Altiarchaeales.f__Altiarchaeaceae.g__Candidatus_Altiarchaeum.s__uncultured_archaeon
116 OTU116 d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.g__Zixibacteria.s__uncultured_bacterium
118 OTU118 d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__uncultured_archaeon
119 OTU119 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__uncultured.s__uncultured_Bacteroidetes
120 OTU120 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.gDesulfobulbus.
122 OTU122 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.guncultured.
123 OTU123 d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gPir4_lineage.
124 OTU124 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_gamma
130 OTU130 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_III.gClade_III.
132 OTU132 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__BD7.8.f__BD7.8.gBD7.8.
133 OTU133 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.gDesulfatiglans.
135 OTU135 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__SAR11_clade.f__Clade_I.gClade_Ia.
136 OTU136 d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_bacterium
139 OTU139 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__Micavibrionaceae.g__uncultured.s__uncultured_bacterium
140 OTU140 d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Cyanobacteriaceae..
144 OTU144 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gAquibacter.
146 OTU146 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__GWE2.31.10.s__uncultured_bacterium
147 OTU147 d__Bacteria.p__Nitrospirota.c__Thermodesulfovibrionia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_Nitrospirae
148 OTU148 d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.gR76.B128.
149 OTU149 d__Bacteria.p__Acidobacteriota.c__Subgroup_18.o__Subgroup_18.f__Subgroup_18.gSubgroup_18.
150 OTU150 d__Archaea......
152 OTU152 d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Pedosphaerales.f__Pedosphaeraceae.g__DEV008.s__uncultured_bacterium
153 OTU153 d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__uncultured.f__uncultured.g__uncultured.s__uncultured_archaeon
155 OTU155 d__Bacteria.p__Bacteroidota.c__Ignavibacteria.o__Ignavibacteriales.f__Melioribacteraceae.g__IheB3.7.s__uncultured_sediment
156 OTU156 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_bacterium
157 OTU157 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__SEEP.SRB1.s__Olavius_ilvae
158 OTU158 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_bacterium
159 OTU159 d__Bacteria.pAcidobacteriota.....
160 OTU160 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Alphaproteobacteria
161 OTU161 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Caldilineales.f__Caldilineaceae.guncultured.
162 OTU162 d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.g__PAUC43f_marine_benthic_group.s__uncultured_actinobacterium
168 OTU168 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thiomicrospiraceae.gendosymbionts.
169 OTU169 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.guncultured.
173 OTU173 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Caldithrix.s__uncultured_Deferribacteres
175 OTU175 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae..
181 OTU181 d__Bacteria.p__Chloroflexi.c__Anaerolineae....
185 OTU185 d__Bacteria.p__Patescibacteria.c__WWE3.o__WWE3.f__WWE3.g__WWE3.s__uncultured_bacterium
189 OTU189 d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Defluviitaleaceae.g__Defluviitaleaceae_UCG.011.s__uncultured_bacterium
190 OTU190 d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.g__uncultured.s__bacterium_enrichment
191 OTU191 d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__Woesearchaeales.g__Woesearchaeales.s__uncultured_crenarchaeote
192 OTU192 d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRhodopirellula.
194 OTU194 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__Spirochaeta_isovalerica
204 OTU204 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SBR1031.f__SBR1031.g__SBR1031.s__metagenome
205 OTU205 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.gSediminispirochaeta.
208 OTU208 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gLutibacter.
211 OTU211 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfosarcina.s__uncultured_marine
213 OTU213 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.gDesulfatitalea.
216 OTU216 d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_bacterium
217 OTU217 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__Desulfobulbus.s__uncultured_bacterium
222 OTU222 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Prolixibacteraceae.gSunxiuqinia.
226 OTU226 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__KZNMV.5.B42.f__KZNMV.5.B42.gKZNMV.5.B42.
228 OTU228 d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__Oligosphaerales.f__Lenti.02.g__Lenti.02.s__uncultured_bacterium
230 OTU230 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_Bacteroidetes
231 OTU231 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae..
232 OTU232 d__Bacteria.p__Chloroflexi.c__KD4.96.o__KD4.96.f__KD4.96.gKD4.96.
236 OTU236 d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.gMSB.3C8.
237 OTU237 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__uncultured.s__uncultured_marine
239 OTU239 d__Bacteria.p__Zixibacteria.c__Zixibacteria.o__Zixibacteria.f__Zixibacteria.gZixibacteria.
242 OTU242 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Gammaproteobacteria_Incertae_Sedis.f__Unknown_Family.g__Unknown_Family.s__uncultured_bacterium
249 OTU249 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Sedimenticolaceae.guncultured.
252 OTU252 d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.gDEV007.
253 OTU253 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfatiglandales.f__Desulfatiglandaceae.g__Desulfatiglans.s__uncultured_delta
260 OTU260 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_organism
266 OTU266 d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Kerfeldbacteria.f__Candidatus_Kerfeldbacteria.g__Candidatus_Kerfeldbacteria.s__uncultured_bacterium
267 OTU267 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Hyphomonadaceae.guncultured.
274 OTU274 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.g__Thermomarinilinea.s__uncultured_bacterium
276 OTU276 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria....
277 OTU277 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Kiloniellales.f__Kiloniellaceae.gTistlia.
284 OTU284 d__Bacteria.p__LCP.89.c__LCP.89.o__LCP.89.f__LCP.89.g__LCP.89.s__saltmarsh_clone
292 OTU292 d__Archaea.p__Thermoplasmatota.c__Thermoplasmatota.o__Thermoplasmatota.f__Thermoplasmatota.g__uncultured.s__uncultured_archaeon
293 OTU293 d__Archaea.p__Aenigmarchaeota.c__Deep_Sea_Euryarchaeotic_Group.DSEG..o__Deep_Sea_Euryarchaeotic_Group.DSEG..f__Deep_Sea_Euryarchaeotic_Group.DSEG..g__Deep_Sea_Euryarchaeotic_Group.DSEG..s__uncultured_archaeon
300 OTU300 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Sediminispirochaeta.s__uncultured_Spirochaetes
304 OTU304 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gMaritimimonas.
305 OTU305 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_organism
307 OTU307 d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.oWoesearchaeales...
314 OTU314 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__UBA10353_marine_group.f__UBA10353_marine_group.g__UBA10353_marine_group.s__uncultured_bacterium
318 OTU318 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.g__Filomicrobium.s__uncultured_Alphaproteobacteria
328 OTU328 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Desulfatirhabdium.s__uncultured_delta
334 OTU334 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__MSBL9.f__SG8.4.gSG8.4.
337 OTU337 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__SJA.15.f__SJA.15.g__SJA.15.s__uncultured_Chloroflexi
340 OTU340 d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Absconditabacteriales_.SR1..f_Absconditabacteriales.SR1..g_Absconditabacteriales.SR1..s__uncultured_bacterium
341 OTU341 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Crocinitomicaceae.g__Crocinitomix.s__uncultured_bacterium
353 OTU353 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Nitrincolaceae.guncultured.
354 OTU354 d__Bacteria.p__Spirochaetota.c__Spirochaetia.o__Spirochaetales.f__Spirochaetaceae.g__Spirochaeta_2.s__uncultured_Spirochaetes
356 OTU356 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.g__CCM11a.s__uncultured_bacterium
358 OTU358 d__Bacteria.p__Elusimicrobiota.c__Lineage_IIb.o__Lineage_IIb.f__Lineage_IIb.g__Lineage_IIb.s__uncultured_bacterium
368 OTU368 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gHIMB11.
371 OTU371 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Maritimimonas.s__uncultured_organism
372 OTU372 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Pseudohongiellaceae.gPseudohongiella.
381 OTU381 d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Candidatus_Peregrinibacteria.f__Candidatus_Peregrinibacteria.g__Candidatus_Peregrinibacteria.s__uncultured_bacterium
386 OTU386 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Vibrionales.f__Vibrionaceae.gVibrio.
389 OTU389 d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.g__Peredibacter.s__uncultured_bacterium
390 OTU390 d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_organism
392 OTU392 d__Archaea.p__Asgardarchaeota.c__Odinarchaeia.o__Odinarchaeia.f__Odinarchaeia.g__Odinarchaeia.s__uncultured_marine
393 OTU393 d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__uncultured_actinobacterium
394 OTU394 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfosarcinaceae.g__Sva0081_sediment_group.s__uncultured_marine
397 OTU397 d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.gNB1.j.
400 OTU400 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__CCM11a.f__CCM11a.gCCM11a.
401 OTU401 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Marinilabiliaceae.g__Labilibacter.s__uncultured_Bacteroidetes
402 OTU402 d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_organism
421 OTU421 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.guncultured.
424 OTU424 d__Bacteria.p__Gemmatimonadota.c__PAUC43f_marine_benthic_group.o__PAUC43f_marine_benthic_group.f__PAUC43f_marine_benthic_group.gPAUC43f_marine_benthic_group.
426 OTU426 d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophaceae.g__Candidatus_Omnitrophus.s__uncultured_bacterium
435 OTU435 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.gDesulfobacter.
436 OTU436 d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__Bradymonadales.f__Bradymonadales.gBradymonadales.
448 OTU448 d__Bacteria.p__Patescibacteria.c__ABY1.o__Candidatus_Falkowbacteria.f__Candidatus_Falkowbacteria.gCandidatus_Falkowbacteria.
453 OTU453 d__Bacteria.p__Modulibacteria.c__Moduliflexia.o__Moduliflexales.f__Moduliflexaceae.gModuliflexaceae.
463 OTU463 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__RBG.13.54.9.f__RBG.13.54.9.g__RBG.13.54.9.s__uncultured_bacterium
475 OTU475 d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__SCGC.AB.539.J10.s__uncultured_bacterium
476 OTU476 d__Bacteria.p__Myxococcota.c__Polyangia.o__MidBa8.f__MidBa8.g__MidBa8.s__uncultured_delta
480 OTU480 d__Bacteria.p__Calditrichota.c__Calditrichia.o__Calditrichales.f__Calditrichaceae.g__Calditrichaceae.s__uncultured_organism
482 OTU482 d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Chloroplast.f__Chloroplast.g__Chloroplast.s__Chromerida_sp.
483 OTU483 d__Bacteria.p__Bdellovibrionota.c__Oligoflexia.o__Oligoflexales.f__uncultured.g__uncultured.s__uncultured_organism
490 OTU490 d__Bacteria.p__Bdellovibrionota.c__Bdellovibrionia.o__Bacteriovoracales.f__Bacteriovoracaceae.gPeredibacter.
495 OTU495 d__Archaea.p__Micrarchaeota.c__Micrarchaeia.o__Micrarchaeales.f__Micrarchaeales.g__Micrarchaeales.s__uncultured_euryarchaeote
501 OTU501 d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Bacteroidetes
503 OTU503 d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__Chitinivibrionales.f__Chitinivibrionaceae.g__possible_genus_03.s__uncultured_bacterium
508 OTU508 d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__DEV007.g__DEV007.s__uncultured_organism
509 OTU509 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Rhodospirillaceae.g__Candidatus_Riegeria.s__uncultured_bacterium
530 OTU530 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiomicrospirales.f__Thioglobaceae.gSUP05_cluster.
532 OTU532 d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__bacterium_enrichment
537 OTU537 d__Bacteria.p__Fermentibacterota.c__Fermentibacteria.o__Fermentibacterales.f__Fermentibacteraceae.g__Fermentibacteraceae.s__uncultured_sediment
539 OTU539 d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.gCandidatus_Kaiserbacteria.
541 OTU541 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ga0077536.f__Ga0077536.g__Ga0077536.s__uncultured_gamma
547 OTU547 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Rhizobiales_Incertae_Sedis.gAnderseniella.
553 OTU553 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Helicobacteraceae..
554 OTU554 d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.g__AB.539.J10.s__uncultured_bacterium
565 OTU565 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Portibacter.s__uncultured_Bacteroidetes
568 OTU568 d__Bacteria.p__Margulisbacteria.c__Margulisbacteria.o__Margulisbacteria.f__Margulisbacteria.g__Margulisbacteria.s__uncultured_bacterium
570 OTU570 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Thermoflexales.f__Thermoflexaceae.gThermoflexus.
574 OTU574 d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__uncultured.s__uncultured_organism
585 OTU585 d__Bacteria.p__Bacteroidota.c__Kryptonia.o__Kryptoniales.f__MSB.3C8.g__MSB.3C8.s__uncultured_bacterium
618 OTU618 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Ectothiorhodospirales.f__Ectothiorhodospiraceae.g__Thiogranum.s__uncultured_gamma
620 OTU620 d__Bacteria.p__Firmicutes.c__Clostridia.o__Lachnospirales.f__Lachnospiraceae.g__uncultured.s__Clostridiales_bacterium
622 OTU622 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_Bacteroidetes
628 OTU628 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Rs.M59_termite_group.g__Rs.M59_termite_group.s__uncultured_bacterium
631 OTU631 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Campylobacteraceae.g__Campylobacter.s__uncultured_Epsilonproteobacteria
657 OTU657 d__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__OPB41.f__OPB41.g__OPB41.s__unidentified_marine
663 OTU663 d__Bacteria.p__Acidobacteriota.c__Vicinamibacteria.o__Subgroup_9.f__Subgroup_9.g__Subgroup_9.s__uncultured_bacterium
694 OTU694 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__NS5_marine_group.s__uncultured_Flavobacterium
700 OTU700 d__Bacteria.p__SAR324_clade.Marine_group_B..c__SAR324_clade.Marine_group_B..o__SAR324_clade.Marine_group_B..f__SAR324_clade.Marine_group_B..g__SAR324_clade.Marine_group_B..s__uncultured_organism
706 OTU706 d__Bacteria.p__Myxococcota.c__Polyangia.o__UASB.TL25.f__UASB.TL25.gUASB.TL25.
708 OTU708 d__Bacteria.p__Patescibacteria.c__Gracilibacteria.o__Gracilibacteria.f__Gracilibacteria.g__Gracilibacteria.s__uncultured_organism
714 OTU714 d__Bacteria.p__Verrucomicrobiota.c__Omnitrophia.o__Omnitrophales.f__Omnitrophales.g__Omnitrophales.s__uncultured_bacterium
729 OTU729 d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__uncultured_organism
742 OTU742 d__Bacteria.p__Planctomycetota.c__Pla4_lineage.o__Pla4_lineage.f__Pla4_lineage.g__Pla4_lineage.s__uncultured_bacterium
750 OTU750 d__Bacteria.p__Desulfobacterota.c__Syntrophobacteria.o__Syntrophobacterales.f__uncultured.g__uncultured.s__uncultured_delta
761 OTU761 d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Microtrichales.f__Microtrichaceae.gSva0996_marine_group.
776 OTU776 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodospirillales.f__Terasakiellaceae.guncultured.
789 OTU789 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfocapsaceae.g__Desulfopila.s__uncultured_bacterium
794 OTU794 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gMarivita.
803 OTU803 d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.gAenigmarchaeales.
814 OTU814 d__Archaea.p__Aenigmarchaeota.c__Aenigmarchaeia.o__Aenigmarchaeales.f__Aenigmarchaeales.g__Aenigmarchaeales.s__uncultured_archaeon
823 OTU823 d__Bacteria.p__WS2.c__WS2.o__WS2.f__WS2.g__WS2.s__uncultured_Firmicutes
837 OTU837 d__Bacteria.p__Latescibacterota.c__Latescibacteria.o__Latescibacterales.f__Latescibacteraceae.g__Latescibacteraceae.s__bacterium_enrichment
840 OTU840 d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__SAR202_clade.f__SAR202_clade.gSAR202_clade.
845 OTU845 d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.gVermiphilaceae.
897 OTU897 d__Bacteria.p__Chloroflexi.c__Dehalococcoidia.o__GIF9.f__AB.539.J10.gAB.539.J10.
905 OTU905 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Granulosicoccales.f__Granulosicoccaceae.g__Granulosicoccus.s__uncultured_bacterium
908 OTU908 d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.oOpitutales...
933 OTU933 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae.g__Sulfurimonas.s__bacterium_enrichment
937 OTU937 d__Bacteria.p__Acidobacteriota.c__Acidobacteriae....
950 OTU950 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__CH2b56.f__CH2b56.g__CH2b56.s__uncultured_bacterium
954 OTU954 d__Bacteria.p__Planctomycetota.c__vadinHA49.o__vadinHA49.f__vadinHA49.g__vadinHA49.s__uncultured_organism
967 OTU967 d__Bacteria.p__Planctomycetota.c__Pla3_lineage.o__Pla3_lineage.f__Pla3_lineage.g__Pla3_lineage.s__uncultured_planctomycete
983 OTU983 d__Bacteria.p__Desulfobacterota.c__Desulfobulbia.o__Desulfobulbales.f__Desulfobulbaceae.g__uncultured.s__uncultured_gamma
987 OTU987 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__uncultured.s__uncultured_organism
988 OTU988 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Sphingomonadales.f__Sphingomonadaceae.g__Erythrobacter.s__Erythrobacter_longus
989 OTU989 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__Lewinella.s__uncultured_organism
991 OTU991 d__Bacteria.p__Acetothermia.c__Acetothermiia.o__Acetothermiia.f__Acetothermiia.g__Acetothermiia.s__uncultured_bacterium
993 OTU993 d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__Sumerlaeales.f__Sumerlaeaceae.g__Sumerlaea.s__uncultured_organism
998 OTU998 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.g__Flavobacteriaceae.s__Aureicoccus_marinus
1000 OTU1000 d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Synechococcales.f__Cyanobiaceae.gCyanobium_PCC.6307.
1001 OTU1001 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Paracaedibacterales.f__Paracaedibacteraceae.g__Candidatus_Captivus.s__uncultured_bacterium
1003 OTU1003 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Sphingobacteriales.f__NS11.12_marine_group.g__NS11.12_marine_group.s__uncultured_marine
1013 OTU1013 d__Archaea.p__Nanoarchaeota.c__Nanoarchaeia.o__Woesearchaeales.f__GW2011_GWC1_47_15.gGW2011_GWC1_47_15.
1014 OTU1014 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.oCellvibrionales...
1041 OTU1041 d__Bacteria.p__Verrucomicrobiota.c__Verrucomicrobiae.o__Verrucomicrobiales.f__Rubritaleaceae.g__uncultured.s__uncultured_bacterium
1050 OTU1050 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__uncultured.g__uncultured.s__uncultured_bacterium
1057 OTU1057 d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Babeliales.g__Babeliales.s__uncultured_organism
1064 OTU1064 d__Bacteria.p__Dependentiae.c__Babeliae.o__Babeliales.f__Vermiphilaceae.g__Vermiphilaceae.s__uncultured_delta
1070 OTU1070 d__Bacteria.p__Cyanobacteria.c__Vampirivibrionia.o__Gastranaerophilales.f__Gastranaerophilales.gGastranaerophilales.
1086 OTU1086 d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.gRubripirellula.
1097 OTU1097 d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__unidentified_bacterium
1100 OTU1100 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Cytophagales.f__Cyclobacteriaceae.g__uncultured.s__uncultured_soil
1108 OTU1108 d__Bacteria.p__NB1.j.c__NB1.j.o__NB1.j.f__NB1.j.g__NB1.j.s__uncultured_prokaryote
1114 OTU1114 d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.g__SS1.B.02.17.s__uncultured_Lentisphaerae
1121 OTU1121 d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Eel.36e1D6.g__Eel.36e1D6.s__uncultured_Polyangiaceae
1127 OTU1127 d__Bacteria.p__Actinobacteriota.c__WCHB1.81.o__WCHB1.81.f__WCHB1.81.g__WCHB1.81.s__uncultured_actinobacterium
1129 OTU1129 d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Marine_Benthic_Group_D_and_DHVEG.1.f__Marine_Benthic_Group_D_and_DHVEG.1.g__Marine_Benthic_Group_D_and_DHVEG.1.s__archaeon_enrichment
1142 OTU1142 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurimonadaceae..
1150 OTU1150 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Chromatiales.f__Chromatiaceae.g__Candidatus_Thiobios.s__uncultured_gamma
1160 OTU1160 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Caulobacterales.f__Parvularculaceae.gParvularcula.
1162 OTU1162 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhodobacterales.f__Rhodobacteraceae.gRoseobacter.
1173 OTU1173 d__Bacteria.p__Actinobacteriota.c__Acidimicrobiia.o__Actinomarinales.f__uncultured.g__uncultured.s__bacterium_WH6.7
1178 OTU1178 d__Bacteria.p__Firmicutes.c__Bacilli.o__Izemoplasmatales.f__Izemoplasmataceae.g__Izimaplasma.s__Candidatus_Izimaplasma
1189 OTU1189 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfobacteraceae.g__Desulfospira.s__uncultured_bacterium
1194 OTU1194 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Oceanospirillales.f__Saccharospirillaceae.gReinekea.
1196 OTU1196 d__Bacteria.p__CK.2C2.2.c__CK.2C2.2.o__CK.2C2.2.f__CK.2C2.2.g__CK.2C2.2.s__uncultured_organism
1198 OTU1198 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Cryomorphaceae.g__Vicingus.s__uncultured_sediment
1205 OTU1205 d__Bacteria.p__Chloroflexi.c__Chloroflexia.oThermomicrobiales...
1226 OTU1226 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Rhizobiales.f__Hyphomicrobiaceae.gFilomicrobium.
1228 OTU1228 d__Bacteria.p__Planctomycetota.c__Planctomycetes.o__Pirellulales.f__Pirellulaceae.g__Rhodopirellula.s__uncultured_hydrocarbon
1235 OTU1235 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Micavibrionales.f__uncultured.g__uncultured.s__uncultured_organism
1238 OTU1238 d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__Subgroup_23.s__uncultured_prokaryote
1240 OTU1240 d__Bacteria.p__Myxococcota.c__Polyangia.o__VHS.B4.70.f__VHS.B4.70.g__VHS.B4.70.s__uncultured_delta
1248 OTU1248 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.g__mle1.8.s__uncultured_organism
1249 OTU1249 d__Bacteria.p__Myxococcota.c__Polyangia.o__Polyangiales.f__Sandaracinaceae.gSandaracinus.
1250 OTU1250 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.gCI75cm.2.12.
1284 OTU1284 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Steroidobacterales.f__Woeseiaceae.g__JTB255_marine_benthic_group.s__uncultured_organism
1285 OTU1285 d__Bacteria.p__Fibrobacterota.c__Fibrobacteria.o__Fibrobacterales.f__TG3.gTG3.
1287 OTU1287 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__OM182_clade.f__OM182_clade.g__OM182_clade.s__unidentified_marine
1289 OTU1289 d__Bacteria.p__Verrucomicrobiota.c__Lentisphaeria.o__SS1.B.02.17.f__SS1.B.02.17.gSS1.B.02.17.
1298 OTU1298 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Chitinophagales.f__Saprospiraceae.g__uncultured.s__uncultured_Bacteroidetes
1299 OTU1299 d__Bacteria.p__Verrucomicrobiota.c__Kiritimatiellae.o__Kiritimatiellales.f__Kiritimatiellaceae.g__R76.B128.s__Kiritimatiella_sp.
1301 OTU1301 d__Bacteria.p__Fusobacteriota.c__Fusobacteriia.o__Fusobacteriales.f__Fusobacteriaceae.gFusobacterium.
1302 OTU1302 d__Archaea.p__Asgardarchaeota.c__Lokiarchaeia.o__Lokiarchaeia.f__Lokiarchaeia.g__Lokiarchaeia.s__uncultured_crenarchaeote
1303 OTU1303 d__Bacteria.p__Desulfobacterota.c__Desulfuromonadia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_bacterium
1308 OTU1308 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__mle1.8.f__mle1.8.gmle1.8.
1310 OTU1310 d__Bacteria.p__Spirochaetota.c__MVP.15.o__MVP.15.f__MVP.15.g__MVP.15.s__uncultured_bacterium
1312 OTU1312 d__Bacteria.p__Campilobacterota.c__Campylobacteria.o__Campylobacterales.f__Sulfurospirillaceae.gSulfurospirillum.
1314 OTU1314 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Nitrosococcales.f__Nitrosococcaceae.g__SZB85.s__uncultured_bacterium
1318 OTU1318 d__Archaea.p__Thermoplasmatota.c__Thermoplasmata.o__Methanomassiliicoccales.f__uncultured.g__uncultured.s__uncultured_euryarchaeote
1321 OTU1321 d__Bacteria.p__Cyanobacteria.c__Cyanobacteriia.o__Cyanobacteriales.f__Xenococcaceae.g__Chroococcidiopsis_PCC.6712.s__uncultured_cyanobacterium
1325 OTU1325 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__uncultured.f__uncultured.guncultured.
1329 OTU1329 d__Bacteria.p__Desulfobacterota.c__Desulfobacteria.o__Desulfobacterales.f__Desulfococcaceae.gDesulfonema.
1330 OTU1330 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Thiotrichales.f__Thiotrichaceae..
1338 OTU1338 d__Bacteria.p__Myxococcota.c__Polyangia.o__Blfdi19.f__Blfdi19.g__Blfdi19.s__uncultured_bacterium
1343 OTU1343 d__Bacteria.p__Acidobacteriota.c__Thermoanaerobaculia.o__Thermoanaerobaculales.f__Thermoanaerobaculaceae.g__B276.D12.s__uncultured_bacterium
1347 OTU1347 d__Bacteria.p__Fibrobacterota.c__Chitinivibrionia.o__uncultured.f__uncultured.guncultured.
1350 OTU1350 d__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Flavobacteriales.f__Flavobacteriaceae.gPolaribacter.
1351 OTU1351 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__AKAU3564_sediment_group.g__AKAU3564_sediment_group.s__uncultured_bacterium
1352 OTU1352 d__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__D90.f__D90.g__D90.s__uncultured_gamma
1353 OTU1353 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Amsterdam.1B.07.f__Amsterdam.1B.07.g__Amsterdam.1B.07.s__uncultured_bacterium
1359 OTU1359 d__Bacteria.p__Patescibacteria.c__Parcubacteria.o__Candidatus_Kaiserbacteria.f__Candidatus_Kaiserbacteria.g__Candidatus_Kaiserbacteria.s__bacterium_SH4.10
1361 OTU1361 d__Bacteria.p__Chloroflexi.c__Anaerolineae.o__Anaerolineales.f__Anaerolineaceae.gPelolinea.
1362 OTU1362 d__Bacteria.p__Gemmatimonadota.c__BD2.11_terrestrial_group.o__BD2.11_terrestrial_group.f__BD2.11_terrestrial_group.g__BD2.11_terrestrial_group.s__unidentified_bacterium
1363 OTU1363 d__Bacteria.p__Sumerlaeota.c__Sumerlaeia.o__uncultured.f__uncultured.g__uncultured.s__uncultured_organism
1369 OTU1369 d__Bacteria.p__Planctomycetota.c__Phycisphaerae.o__Phycisphaerales.f__Phycisphaeraceae.g__uncultured.s__uncultured_bacterium
1370 OTU1370 d__Bacteria.p__Latescibacterota.c__Latescibacterota.o__Latescibacterota.f__Latescibacterota.g__Latescibacterota.s__uncultured_Acidobacterium
1374 OTU1374 d__Bacteria.p__Proteobacteria.c__Alphaproteobacteria.o__Defluviicoccales.f__Defluviicoccaceae.g__Defluviicoccus.s__uncultured_Rhodospirillaceae

The separation between groups is 3.12.

We update the plot with the separations:

intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR.max",
"LR.max.local",
"Aldex",
"glmnet",
"glmnet.ext",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)

Resultados %>%
  kbl() %>%
  kable_styling()
Size Separation
LR.max 549 2.829712
LR.max.local 46 4.655234
Aldex 121 3.429094
glmnet 350 2.776603
glmnet.ext 104 4.421816
intersection 312 3.116542

6.5 CAU T2C vs CAU T2HW

tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]

Samples=tax_otu$ID 
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")

DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs

taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]

Miseria=as.vector(which(colSums(DF.0)<=2))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]

hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL


DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]

row.names(DF.0)=Samples

DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[7:12,]
DF.0=DF.0[7:12,]
DF.0.original=DF.0.original[7:12 ,]

Type=as.factor(c(rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors=c("purple","red")[Type]

After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 1682 taxa remain.

The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:

Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
2 CAU_T0_2 OTU966 55 0.0012193
1 CAU_T2HW_1 OTU727 55 0.0008798
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)

6.5.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,colores=colors)

base=Sep(HC)

The two sample types are NOT well separated.

We examine the proportion of OTU subsets of reasonable size that already classify them correctly.

# Fijo una semilla de aleatoriedad para el experimento
> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1715851715853234
initial_seed %% 10000
# [1] 3234
set.seed(3234)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:1000)
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
  if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))}
  else {
    F.SS=rbind(F.SS, c(i,
                     0,
                      NA,
                      NA,
                      NA))
  }
  }
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T2CvsT2HW.RData")

The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CAU_T2C and CAU_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T2C and CAU_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve). Note that from \(n=350\) onward, NO subset is found that separates the two groups.

fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

`

6.5.2 Taxa with maximum relevant log-ratios

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_2C2HW.RData")
LRS=readRDS("LRS_CAU_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CAU_2C2HW.RData")
assoc=readRDS("assoc_CAU_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))

HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)

LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 95 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 1.98.

Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Log_ratio.max=Indicador(impLR))

6.5.3 Taxa relevant for the bacterial signature (according to coda_glmnet)

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL

coda_glmnet does not find taxa with significant differences, likely due to the low power with only 3 vs. 3 samples. When this happens, we add random replicates, with the idea that stochastic resonance may strengthen the signal. We test with 3, 5, and 10 replicates.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711435693
initial_seed %% 10000
# [1] 5693
# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
  paste("CAU_T2C_2",1:M,sep="_"),
  paste("CAU_T2C_3",1:M,sep="_"),
  paste("CAU_T2HW_1",1:M,sep="_"),
  paste("CAU_T2HW_2",1:M,sep="_"),
  paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))

With 3 replicates, 42 significant taxa are found, perfectly separating the two groups, with a separation of 1.49.

# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
  paste("CAU_T2C_2",1:M,sep="_"),
  paste("CAU_T2C_3",1:M,sep="_"),
  paste("CAU_T2HW_1",1:M,sep="_"),
  paste("CAU_T2HW_2",1:M,sep="_"),
  paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_5copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))

With 5 replicates, 53 significant taxa are found, perfectly separating the two groups, with a separation of 1.2.

Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T2C_1,repls$CAU_T2C_2,repls$CAU_T2C_3,repls$CAU_T2HW_1,repls$CAU_T2HW_2,repls$CAU_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T2C_1",1:M,sep="_"),
  paste("CAU_T2C_2",1:M,sep="_"),
  paste("CAU_T2C_3",1:M,sep="_"),
  paste("CAU_T2HW_1",1:M,sep="_"),
  paste("CAU_T2HW_2",1:M,sep="_"),
  paste("CAU_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T2C", 3*M),rep("CAU_T2HW", 3*M),rep("CAU_T2C", 3),rep("CAU_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_2C2HW_10copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))

With 10 replicates, 59 significant taxa are found, perfectly separating the two groups, with a separation of 1.12.

Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))

6.5.4 OTUs significant according to ALDeX

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711461735
initial_seed %% 10000
# [1] 1735
set.seed(1735)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_CAU_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_CAU_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_CAU_2C2HW.RData") 
min(p.valores_kw[,1])
## [1] 0.04999719
min(p.valores_kw[,2])
## [1] 0.3641574
min(p.valores_kw[,3])
## [1] 0.002010094
min(p.valores_kw[,4])
## [1] 0.07355742
length(p.valores_kw[p.valores_kw[,3]<0.01,2])
## [1] 11
length(p.valores_kw[p.valores_kw[,3]<0.05,2])
## [1] 68

There are no taxa with even an unadjusted p-value < 0.05.

6.5.5 Simper

DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP.07=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)]
indexos_SMP.05=SMP.prop$CAU_T2C_CAU_T2HW$ord[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.5)]

OTUs_SMP.07=sort(attr(SMP.prop$CAU_T2C_CAU_T2HW$cusum[which(SMP.prop$CAU_T2C_CAU_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good.07=setdiff(OTUs_SMP.07,Unics$OTUs)

simper identifies 170 significant taxa, but they do not separate the samples well.

HC=ClusterHC(DF[,OTUs_SMP.good.07],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

Significant=cbind(Significant,
t(DF.prop))

6.5.6 Summary

write.csv2(Significant,"OTU_Significativos_CAU_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10

)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)

Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates

Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

There are 10 taxa. They are:

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
OTU38 OTU38 Unassigned;;;;;;
OTU108 OTU108 d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gGWE2-31-10;
OTU199 OTU199 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;gCobetia;
OTU208 OTU208 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gLutibacter;
OTU337 OTU337 d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SJA-15;f__SJA-15;g__SJA-15;s__uncultured_Chloroflexi
OTU365 OTU365 d__Bacteria;p__Firmicutes;c__Clostridia;oLachnospirales;;;
OTU383 OTU383 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Alteromonadaceae;;
OTU685 OTU685 d__Bacteria;p__PAUC34f;c__PAUC34f;o__PAUC34f;f__PAUC34f;g__PAUC34f;s__uncultured_bacterium
OTU708 OTU708 d__Bacteria;p__Patescibacteria;c__Gracilibacteria;o__Gracilibacteria;f__Gracilibacteria;g__Gracilibacteria;s__uncultured_organism
OTU1120 OTU1120 d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;gCaldithrix;

The separation between groups is 1.96.

We update the plot with the separations:

intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)

Resultados %>%
  kbl() %>%
  kable_styling()
Size Separation
LR 95 1.981986
glmnet.ext.3 42 1.492428
glmnet.ext.5 53 1.203924
glmnet.ext.10 59 1.120839
intersection 10 1.960728

6.6 CYM T2C vs CYM T2HW

tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]
Samples=tax_otu$ID 
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")

DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs

taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]
indices=1:n.OTUs.original

hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL

Miseria=as.vector(which(colSums(DF.0)<=2|colSums(DF.0.hits)==1))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]
indices=indices[-Miseria]

row.names(DF.0)=Samples

DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF.0=DF.0[7:12,]
DF=DF[7:12,]
DF.0.original=DF.0.original[7:12 ,]

Type=as.factor(c(rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors=c("purple","red")[Type]

After removing OTUs with 2 or fewer reads across the CAU samples and those appearing in only one of these samples, 2247 taxa remain.

The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:

Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
CYM_T2C_1 OTU2121 186 0.0020402
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)

6.6.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,colores=colors)

base=Sep(HC)

The two sample types are perfectly separated from the start. The separation between groups is 1.21.

We examine the proportion of OTU subsets of reasonable size that already classify them correctly.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1714640507
initial_seed %% 10000
# [1] 507
set.seed(507)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:1000)
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))
  }
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T2CvsT2HW.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T2CvsT2HW.RData")

The first plot shows, for each \(n\) up to 600, the proportion of samples of size \(n\) that perfectly classify CYM_T2C and CYM_T2HW (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T2C and CYM_T2HW (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).

fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,1000),xaxp=c(0,1000,20),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
abline(h=base,col="green")

`

6.6.2 Taxa with maximum relevant log-ratios

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_2C2HW.RData")
LRS=readRDS("LRS_CYM_2C2HW.RData")
assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}
saveRDS(assoc, file="assoc_CYM_2C2HW.RData")
assoc=readRDS("assoc_CYM_2C2HW.RData")
plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))

HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)

LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 563 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.15.

Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Log_ratio.max=Indicador(impLR)
  )

6.6.3 Taxa relevant for the bacterial signature (according to coda_glmnet)

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL

coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.

# 3 replicates
set.seed(5693)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
  paste("CYM_T2C_2",1:M,sep="_"),
  paste("CYM_T2C_3",1:M,sep="_"),
  paste("CYM_T2HW_1",1:M,sep="_"),
  paste("CYM_T2HW_2",1:M,sep="_"),
  paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_3copies.RData")
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))

With 3 replicates, 81 significant taxa are found, perfectly separating the two groups, with a separation of 3.67.

Significant=cbind(Significant,
glmnet_sign.ext.3.T2C=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T2HW=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(5693)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_2,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
  paste("CYM_T2C_2",1:M,sep="_"),
  paste("CYM_T2C_3",1:M,sep="_"),
  paste("CYM_T2HW_1",1:M,sep="_"),
  paste("CYM_T2HW_2",1:M,sep="_"),
  paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_5copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:30,37:42),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))

With 5 replicates, 84 significant taxa are found, perfectly separating the two groups, with a separation of 3.4.

Significant=cbind(Significant,
glmnet_sign.ext.5.T2C=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T2HW=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
set.seed(5693)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T2C_1,repls$CYM_T2C_2,repls$CYM_T2C_3,repls$CYM_T2HW_1,repls$CYM_T2HW_1,repls$CYM_T2HW_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T2C_1",1:M,sep="_"),
  paste("CYM_T2C_2",1:M,sep="_"),
  paste("CYM_T2C_3",1:M,sep="_"),
  paste("CYM_T2HW_1",1:M,sep="_"),
  paste("CYM_T2HW_2",1:M,sep="_"),
  paste("CYM_T2HW_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T2C", 3*M),rep("CYM_T2HW", 3*M),rep("CYM_T2C", 3),rep("CYM_T2HW", 3)))
colors_ext=c("purple","red")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_2C2HW_10copies.RData")
LogRat_mostres[[1]]=LogRat_mostres[[1]][c(1:60,67:72),]
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])

coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,taxa,showPlots = FALSE)

impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))

With 10 replicates, 82 significant taxa are found, perfectly separating the two groups, with a separation of 3.09.

Significant=cbind(Significant,
glmnet_sign.ext.10.T2C=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T2HW=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))

6.6.4 OTUs significant according to ALDeX

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1711815195
initial_seed %% 10000
# [1] 5195
set.seed(5195)
repl_kw=aldexCesc.clr(t(DF), Type=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(repl_kw, file="repl_kw_2C2HW.RData")
saveRDS(aldex_kw, file="aldex_kw_2C2HW.RData")
p.valores_kw=readRDS("aldex_kw_2C2HW.RData") 
length(p.valores_kw[p.valores_kw[,1]<0.01,1])
## [1] 0
length(p.valores_kw[p.valores_kw[,1]<0.05,1])
## [1] 29
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,2]<0.01,2])
## [1] 0
sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=TRUE)

There are no taxa with a Kruskal–Wallis adjusted p-value < 0.05. There are 29 with an unadjusted p-value < 0.05 (none < 0.01); in the absence of a better option, we select them. They perfectly separate the two groups, with a separation of 3.11.

Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))

6.7 Simper

DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T2C_CYM_T2HW$ord[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T2C_CYM_T2HW$cusum[which(SMP.prop$CYM_T2C_CYM_T2HW$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)
simper.props=c(length(indexos_SMP),Sep(HC))

We obtain 120 significant taxa, which perfectly separate the two sample types.They include OTU2121, which we had previously removed because it appeared in only one sample. The separation between groups is 1.42.

Indicador.Gl=function(x,k=n.OTUs.original){
  xx=rep(0,k)
 xx[x]=1
  return(xx)
}
Significant=cbind(Significant,
simper.rel=Indicador.Gl(indexos_SMP))
Significant=cbind(Significant,
t(DF.prop))

6.7.1 Summary

write.csv2(Significant,"OTU_Significativos_CYM_T2.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10,
sep.kw.noadjust.0.05,
simper.props
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[,1],F.SS[,3],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS[,4],cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS[,5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,6,20),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)

Finally, we intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates

Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

There are 60 taxa. They are:

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
OTU4 OTU4 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Robiginitalea;s__bacterium_AMSU
OTU6 OTU6 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Maritimimonas;s__uncultured_Bacteroidetes
OTU19 OTU19 d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;guncultured;
OTU25 OTU25 d__Archaea;p__Asgardarchaeota;c__Heimdallarchaeia;o__Heimdallarchaeia;f__Heimdallarchaeia;g__Heimdallarchaeia;s__uncultured_archaeon
OTU26 OTU26 d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__SBR1031;f__SBR1031;g__SBR1031;s__uncultured_organism
OTU27 OTU27 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Ulvibacter;s__Aureitalea_sp.
OTU31 OTU31 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;oEctothiorhodospirales;;;
OTU34 OTU34 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gMaribacter;
OTU48 OTU48 d__Bacteria;p__Patescibacteria;c__Parcubacteria;;;;
OTU56 OTU56 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Arenicellales;f__Arenicellaceae;guncultured;
OTU78 OTU78 d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;gSpirochaeta;
OTU85 OTU85 d__Bacteria;pChloroflexi;;;;;
OTU94 OTU94 d__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;g__Spirochaeta;s__uncultured_bacterium
OTU102 OTU102 d__Bacteria;p__Bacteroidota;c__Ignavibacteria;o__Ignavibacteriales;f__Melioribacteraceae;g__IheB3-7;s__uncultured_Chlorobi
OTU107 OTU107 d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;gWCHB1-81;
OTU118 OTU118 d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;g__Marine_Benthic_Group_D_and_DHVEG-1;s__uncultured_archaeon
OTU123 OTU123 d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;gPir4_lineage;
OTU126 OTU126 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;guncultured;
OTU133 OTU133 d__Bacteria;p__Desulfobacterota;c__Desulfobacteria;o__Desulfatiglandales;f__Desulfatiglandaceae;gDesulfatiglans;
OTU136 OTU136 d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_bacterium
OTU137 OTU137 d__Archaea;p__Thermoplasmatota;c__Thermoplasmata;o__Marine_Benthic_Group_D_and_DHVEG-1;f__Marine_Benthic_Group_D_and_DHVEG-1;gMarine_Benthic_Group_D_and_DHVEG-1;
OTU143 OTU143 d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__Bathyarchaeia;f__Bathyarchaeia;gBathyarchaeia;
OTU144 OTU144 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gAquibacter;
OTU163 OTU163 d__Bacteria;p__Modulibacteria;c__Moduliflexia;o__Moduliflexales;f__Moduliflexaceae;g__Moduliflexaceae;s__uncultured_bacterium
OTU168 OTU168 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;gendosymbionts;
OTU174 OTU174 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__SB-5;g__SB-5;s__bacterium_YC-LK-LKJ31
OTU178 OTU178 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;gWinogradskyella;
OTU181 OTU181 d__Bacteria;p__Chloroflexi;c__Anaerolineae;;;;
OTU182 OTU182 d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__uncultured;f__uncultured;guncultured;
OTU186 OTU186 d__Bacteria;p__LCP-89;c__LCP-89;o__LCP-89;f__LCP-89;g__LCP-89;s__uncultured_bacterium
OTU200 OTU200 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;;
OTU212 OTU212 d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_bacterium
OTU216 OTU216 d__Bacteria;p__Actinobacteriota;c__WCHB1-81;o__WCHB1-81;f__WCHB1-81;g__WCHB1-81;s__uncultured_bacterium
OTU224 OTU224 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Sedimenticolaceae;g__uncultured;s__uncultured_bacterium
OTU241 OTU241 d__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;g__uncultured;s__uncultured_Chloroflexi
OTU283 OTU283 d__Bacteria;p__Calditrichota;c__Calditrichia;o__Calditrichales;f__Calditrichaceae;;
OTU296 OTU296 d__Bacteria;p__Patescibacteria;c__Parcubacteria;o__Candidatus_Kaiserbacteria;f__Candidatus_Kaiserbacteria;g__Candidatus_Kaiserbacteria;s__uncultured_prokaryote
OTU309 OTU309 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;gLewinella;
OTU329 OTU329 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Flavobacteriaceae;g__Aquibacter;s__uncultured_marine
OTU347 OTU347 d__Bacteria;p__Chloroflexi;c__Dehalococcoidia;;;;
OTU372 OTU372 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Pseudohongiellaceae;gPseudohongiella;
OTU412 OTU412 d__Bacteria;p__Desulfobacterota;c__Desulfobulbia;oDesulfobulbales;;;
OTU426 OTU426 d__Bacteria;p__Verrucomicrobiota;c__Omnitrophia;o__Omnitrophales;f__Omnitrophaceae;g__Candidatus_Omnitrophus;s__uncultured_bacterium
OTU433 OTU433 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Lewinella;s__uncultured_marine
OTU490 OTU490 d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;gPeredibacter;
OTU516 OTU516 d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bdellovibrionales;f__Bdellovibrionaceae;gBdellovibrio;
OTU565 OTU565 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Portibacter;s__uncultured_Bacteroidetes
OTU574 OTU574 d__Bacteria;p__Planctomycetota;c__Planctomycetes;o__Pirellulales;f__Pirellulaceae;g__uncultured;s__uncultured_organism
OTU597 OTU597 d__Bacteria;p__Acidobacteriota;c__Thermoanaerobaculia;o__Thermoanaerobaculales;f__Thermoanaerobaculaceae;g__Subgroup_23;s__uncultured_bacterium
OTU622 OTU622 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__uncultured;g__uncultured;s__uncultured_Bacteroidetes
OTU625 OTU625 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__pItb-vmat-80;f__pItb-vmat-80;gpItb-vmat-80;
OTU659 OTU659 d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__Mitochondria;g__Mitochondria;s__uncultured_bacterium
OTU785 OTU785 d__Archaea;p__Asgardarchaeota;c__Odinarchaeia;o__Odinarchaeia;f__Odinarchaeia;g__Odinarchaeia;s__uncultured_crenarchaeote
OTU854 OTU854 d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Opitutales;f__Puniceicoccaceae;gLentimonas;
OTU980 OTU980 d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Chitinophagales;f__Saprospiraceae;g__Phaeodactylibacter;s__uncultured_organism
OTU1088 OTU1088 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;gCongregibacter;
OTU1502 OTU1502 d__Archaea;p__Thermoplasmatota;c__Thermoplasmatota;o__Thermoplasmatota;f__Thermoplasmatota;guncultured;
OTU1532 OTU1532 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Chromatiaceae;g__Candidatus_Thiobios;s__uncultured_organism
OTU1539 OTU1539 d__Bacteria;p__Acidobacteriota;c__Vicinamibacteria;o__Subgroup_17;f__Subgroup_17;gSubgroup_17;
OTU2370 OTU2370 d__Bacteria;p__Bdellovibrionota;c__Bdellovibrionia;o__Bacteriovoracales;f__Bacteriovoracaceae;g__uncultured;s__uncultured_deep-sea

The separation between groups is 4.31.

We update the plot with the separations:

intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"Aldex",
"simper",
"Intersection")
colnames(Resultados)=c("Size","Separation")
plot(F.SS[-1,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,150),xaxp=c(0,150,15),yaxp=c(0,6,12),ylim=c(0,6))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados[-dim(Resultados)[1],],col="blue",type="h")
points(Resultados[-dim(Resultados)[1],],col="blue",pch=20)
points(intersección[1],intersección[2],col="purple",type="h")
points(intersección[1],intersección[2],col="purple",pch=20)
text(Resultados[-dim(Resultados)[1],1],Resultados[-dim(Resultados)[1],2]+0.1,col="blue",labels=row.names(Resultados)[-dim(Resultados)[1]],cex=0.75)
text(intersección[1],intersección[2]+0.1,col="purple",labels="intersección",cex=0.75)

abline(h=base,col="green")
text(20,base+0.1,col="green",labels="Global",cex=0.75)
Resultados %>%
  kbl() %>%
  kable_styling()
Size Separation
LR 563 2.150788
glmnet.ext.3 81 3.668474
glmnet.ext.5 84 3.399211
glmnet.ext.10 82 3.085298
Aldex 29 3.114325
simper 120 1.415174
Intersection 60 4.306244

6.8 CAU T0 vs CAU T1

tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[1:12,]

Samples=tax_otu$ID 
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")

DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs

taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]

Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]

hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL


DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]

row.names(DF.0)=Samples

DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]

Type=as.factor(c(rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors=c("blue","green")[Type]

After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1194 taxa remain.

The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:

Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
CAU_T0_2 OTU966 55 0.0012226
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)

6.8.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,colores=colors)

base=Sep(HC)

The two sample types are not initially separated.

We now examine the proportion of OTU subsets of reasonable size that already classify them correctly.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717749309
initial_seed %% 10000
# [1] 9309
set.seed(9309)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=500
F.SS=c()
for (i in 10:600) 
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
  if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))}
    else {
    F.SS=rbind(F.SS, c(i,
                     0,
                      NA,
                      NA,
                      NA))
  }
  }
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CAU.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CAU.T0vsT1.RData")

The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CAU_T0 and CAU_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CAU_T0 and CAU_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).

fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,600),xaxp=c(0,600,12),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

`

6.8.2 Taxa with maximum relevant log-ratios

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CAU_T0T1.RData")
LRS=readRDS("LRS_CAU_T0T1.RData")

assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}

plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))

HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)

LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 24 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.3.

Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Log_ratio.max=Indicador(impLR))

6.8.3 Taxa relevant for the bacterial signature (according to coda_glmnet)

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL

coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1]  1717764843
initial_seed %% 10000
# [1]  4843
# 3 replicates
set.seed(4843)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
  paste("CAU_T0_2",1:M,sep="_"),
  paste("CAU_T0_3",1:M,sep="_"),
  paste("CAU_T1_1",1:M,sep="_"),
  paste("CAU_T1_2",1:M,sep="_"),
  paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))

With 3 replicates, 45 significant taxa are found, perfectly separating the two groups, with a separation of 0.95.

Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.3$taxa.num[coda.glmnet.3$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(4843)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
  paste("CAU_T0_2",1:M,sep="_"),
  paste("CAU_T0_3",1:M,sep="_"),
  paste("CAU_T1_1",1:M,sep="_"),
  paste("CAU_T1_2",1:M,sep="_"),
  paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))

With 5 replicates, 51 significant taxa are found, perfectly separating the two groups, with a separation of 0.88.

Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(4843)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CAU_T0_1,repls$CAU_T0_2,repls$CAU_T0_3,repls$CAU_T1_1,repls$CAU_T1_2,repls$CAU_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CAU_T0_1",1:M,sep="_"),
  paste("CAU_T0_2",1:M,sep="_"),
  paste("CAU_T0_3",1:M,sep="_"),
  paste("CAU_T1_1",1:M,sep="_"),
  paste("CAU_T1_2",1:M,sep="_"),
  paste("CAU_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CAU_T0", 3*M),rep("CAU_T1", 3*M),rep("CAU_T0", 3),rep("CAU_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CAU_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CAU_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))

With 10 replicates, 49 significant taxa are found, but they do not classify well the samples.

Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))

6.8.4 OTUs significant according to ALDeX

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717772151
initial_seed %% 10000
# [1] 2151
set.seed(2151)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
saveRDS(aldex_kw, file="aldex_kw_CAU_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CAU_T0T1.RData") 
length(p.valores_kw[p.valores_kw[,1]<0.05,2])
## [1] 0

There are no taxa with a p-value < 0.05, even without adjustment.

6.9 Simper

The significant taxa identified by simper do not classify the samples well:

DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CAU_T0_CAU_T1$ord[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CAU_T0_CAU_T1$cusum[which(SMP.prop$CAU_T0_CAU_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

Significant=cbind(Significant,
t(DF.prop))

6.9.1 Summary

write.csv2(Significant,"OTU_Significativos_CAU_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10
)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))


lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)

We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates

Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
OTU952 OTU952 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Beggiatoales;f__Beggiatoaceae;g__Thioflexothrix;s__Thiotrichales_bacterium

A single OTU. The same happens with 5 replicates:

Significant.capat=Significant[,c(1,2,3,6,7)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
OTU828 OTU828 d__Bacteria;p__Verrucomicrobiota;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Rubritaleaceae;g__Roseibacillus;s__uncultured_organism
Resultados %>%
  kbl() %>%
  kable_styling()
Size Separation
LR 24 2.2969800
glmnet.ext.3 45 0.9462113
glmnet.ext.5 51 0.8765157
glmnet.ext.10 49 0.8640208

6.10 CYM T0 vs CYM T1

tax_otu=as.data.frame(read_excel("TablaOTUs.xlsx"))[13:24,]

Samples=tax_otu$ID 
taxa=colnames(tax_otu[,3:dim(tax_otu)[2]])
OTUs=paste("OTU",1:(dim(tax_otu)[2]-2),sep="")

DF.0=tax_otu[,-c(1,2)]
row.names(DF.0)=Samples
colnames(DF.0)=OTUs

taxa.original=taxa
DF.0.original=DF.0
n.OTUs.original=dim(DF.0)[2]

Miseria=as.vector(which(colSums(DF.0[1:6,])<=3))
DF.0=DF.0[,-Miseria]
taxa=taxa[-Miseria]
OTUs=OTUs[-Miseria]

hit=function(x){min(c(x,1))}
DF.0.hits=as.matrix(DF.0)
DF.0.hits[]=vapply(DF.0.hits, hit, numeric(1))

Unics=data.frame(
Sample=rep(NA,length(which(colSums(DF.0.hits)==1))), OTUs=names(colSums(DF.0)[which(colSums(DF.0.hits)==1)]), Reads=colSums(DF.0)[which(colSums(DF.0.hits)==1)],
            Proportions=rep(0,length(which(colSums(DF.0.hits)==1)))  
)
for (i in 1:length(Unics$OTUs)){
ii=which(DF.0.hits[,Unics$OTUs[i]]==1)  
Unics$Proportions[i]=DF.0[ii,Unics$OTUs[i]]/sum(DF.0[ii,])
Unics$Sample[i]=Samples[ii]}
Unics.Freqs=Unics[Unics$Proportions>=0.05/100,]
row.names(Unics.Freqs)=NULL

DF.0=DF.0[,-which(colSums(DF.0.hits)==1)]
taxa=taxa[-which(colSums(DF.0.hits)==1)]
OTUs=OTUs[-which(colSums(DF.0.hits)==1)]

row.names(DF.0)=Samples

DF=zCompositions::cmultRepl(DF.0,method="GBM",output="p-counts",suppress.print=TRUE,z.warning=0.99)
DF=DF[1:6,]
DF.0=DF.0[1:6,]
DF.0.original=DF.0.original[1:6 ,]

Type=as.factor(c(rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors=c("blue","green")[Type]

After removing OTUs with 2 or fewer reads across the samples and those appearing in only one of these samples, 1658 taxa remain.

The OTUs removed at this step that represent more than 0.05% of the sample in which they appear are:

Unics.Freqs[rev(order(Unics.Freqs$Proportions)),] %>%
  kbl() %>%
  kable_styling()
Sample OTUs Reads Proportions
rm(tax_otu)
rm(DF.0.hits)
rm(Miseria)
rm(Unics.Freqs)

6.10.1 How different are the samples?

HC=ClusterHC(DF,Grups=Type,colores=colors)

base=Sep(HC)

The two sample types are not initially separated.

We now examine the proportion of OTU subsets of reasonable size that classify them correctly.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717668665
initial_seed %% 10000
# [1] 8665
set.seed(8665)
Experimento=function(i)
{
  ii=sample(dim(DF)[2],i,rep=FALSE)
  DFtemp=DF[,ii]
  CC=ClusterHC(DFtemp,dendrograma=FALSE,barplot=FALSE, Grups=Type,colores=colors)
  c(Encerts(CC$tabla,table(Type)),Sep(CC))
}
n=250
F.SS=c()
for (i in 10:650) #10
  {
  print(i)
  EE=replicate(n,Experimento(i))
  EE.2=EE[2,which(EE[1,]==0)]
  if(length(which(EE[1,]==0))>0){
XX=replicate(1000,mean(sample(EE.2,n,replace=TRUE)))
  F.SS=rbind(F.SS, c(i,
                     length(EE.2)/250,
                      mean(EE.2),
                      quantile(XX,0.025),
                      quantile(XX,0.975)
  ))}
    else {
    F.SS=rbind(F.SS, c(i,
                     0,
                      NA,
                      NA,
                      NA))
  }
  }
colnames(F.SS)=c("n","Proportion","Avg. sep.","Q_0.025","Q_0.975")
saveRDS(F.SS, file="Fraccio.Separadors.CYM.T0vsT1.RData")
F.SS=readRDS("Fraccio.Separadors.CYM.T0vsT1.RData")

The first plot shows, for each \(n\) up to 500, the proportion of samples of size \(n\) that perfectly classify CYM_T0 and CYM_T1 (black curve), along with the 95% Clopper–Pearson confidence interval for that proportion (red curves). We used smoothing splines to smooth the curves.

F.SS_prop.Q_0.025=epitools::binom.exact(F.SS[,2]*250,250)$lower
F.SS_prop.Q_0.975=epitools::binom.exact(F.SS[,2]*250,250)$upper
fit = smooth.spline(F.SS[,1],F.SS[,2],cv=TRUE)
fit.low = smooth.spline(F.SS[,1],F.SS_prop.Q_0.025,cv=TRUE)
fit.up = smooth.spline(F.SS[,1],F.SS_prop.Q_0.975,cv=TRUE)


plot(F.SS[,1:2],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Proportion",main="Proportion of perfect classifiers",ylim=c(0,0.1),xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,0.1,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

The second plot shows, for each \(n\) up to 600, the estimated mean separation obtained with a sample of size \(n\) that perfectly classifies CYM_T0 and CYM_T1 (black curve), along with the 95% confidence interval of this mean separation obtained via bootstrap (red curve).

fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),yaxp=c(0,1.5,15))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)

`

6.10.2 Taxa with maximum relevant log-ratios

LRS=explore_logratios(DF,Type)
saveRDS(LRS, file="LRS_CYM_T0T1.RData")
LRS=readRDS("LRS_CYM_T0T1.RData")

assoc=rep(0,dim(DF)[2]-4)
for (m in 5:dim(DF)[2]){
    assoc[m-4]=sum(LRS$`association log-ratio with y`[1:m,1:m])/m^2
}

plot(5:dim(DF)[2],assoc,type="b",xlab="m",ylab="Average goodness of the m most relevant." ,pch=20,cex=0.5)

m=4+which.max(assoc)
impLR=sort(as.numeric(LRS$`order of importance`[1:m]))

HC=ClusterHC(DF[,impLR],barplot=FALSE,Grups=Type,colores=colors)

LR.max=c(length(impLR),Sep(HC))

The maximum is attained at the set of 169 most logratio-relevant taxa.
They perfectly separate the two sample types. The separation between groups is 2.46.

Significant=data.frame(
  OTUs=paste("OTU",1:n.OTUs.original,sep=""),
  taxa=taxa.original,
  Log_ratio.max=Indicador(impLR))

6.10.3 Taxa relevant for the bacterial signature (according to coda_glmnet)

coda.glmnet=coda_glmnet(DF,Type,showPlots = FALSE)
## [1] "No variables are selected. The prediction and the signature plots are not displayed."
coda.glmnet$`signature plot`
## NULL

coda_glmnet does not find taxa with significant differences. We add random replicates. We test with 3, 5, and 10 replicates.

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1]  1717709647
initial_seed %% 10000
# [1]  9647
# 3 replicates
set.seed(9647)
M=3
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
  paste("CYM_T0_2",1:M,sep="_"),
  paste("CYM_T0_3",1:M,sep="_"),
  paste("CYM_T1_1",1:M,sep="_"),
  paste("CYM_T1_2",1:M,sep="_"),
  paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_3copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_3copies.RData")
coda.glmnet.3=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.3=coda.glmnet.3$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.3],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.3=c(length(impGLMNET_ext.3),Sep(HC))

With 3 replicates, 62 significant taxa are found, perfectly separating the two groups, with a separation of 2.05.

Significant=cbind(Significant,
glmnet_sign.ext.3.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.3.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 5 replicates
set.seed(9647)
M=5
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
  paste("CYM_T0_2",1:M,sep="_"),
  paste("CYM_T0_3",1:M,sep="_"),
  paste("CYM_T1_1",1:M,sep="_"),
  paste("CYM_T1_2",1:M,sep="_"),
  paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_5copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_5copies.RData")
coda.glmnet.5=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.5=coda.glmnet.5$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.5],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.5=c(length(impGLMNET_ext.5),Sep(HC))

With 5 replicates, 74 significant taxa are found, perfectly separating the two groups, with a separation of 1.36.

Significant=cbind(Significant,
glmnet_sign.ext.5.T0=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`>0]),
glmnet_sign.ext.5.T1=Indicador(coda.glmnet.5$taxa.num[coda.glmnet.5$`log-contrast coefficients`<0]))
# 10 replicates
set.seed(9647)
M=10
repls=aldexCesc.clr(t(DF), conds=Type, mc.samples=M)@analysisData
## [1] "operating in serial mode"
repls=t(cbind(repls$CYM_T0_1,repls$CYM_T0_2,repls$CYM_T0_3,repls$CYM_T1_1,repls$CYM_T1_2,repls$CYM_T1_3))
attr(repls,"dimnames")[[2]]=NULL
colnames(repls)=colnames(DF)
row.names(repls)=c(paste("CYM_T0_1",1:M,sep="_"),
  paste("CYM_T0_2",1:M,sep="_"),
  paste("CYM_T0_3",1:M,sep="_"),
  paste("CYM_T1_1",1:M,sep="_"),
  paste("CYM_T1_2",1:M,sep="_"),
  paste("CYM_T1_3",1:M,sep="_"))
mostres=rbind(repls,easyCODA::CLR(DF)$LR)
rm(repls)
Type_ext=as.factor(c(rep("CYM_T0", 3*M),rep("CYM_T1", 3*M),rep("CYM_T0", 3),rep("CYM_T1", 3)))
colors_ext=c("blue","green")[Type_ext]
LogRat_mostres=logratios_matrix_clr(mostres)
LogRat_mostres=list(LogRat_mostres[[1]],LogRat_mostres[[2]])
saveRDS(LogRat_mostres, file="LogRat_mostres_CYM_T0T1_10copies.RData")
LogRat_mostres=readRDS("LogRat_mostres_CYM_T0T1_10copies.RData")
coda.glmnet.10=coda_glmnet_lr_bin(LogRat_mostres,Type_ext,DF,showPlots = FALSE)

impGLMNET_ext.10=coda.glmnet.10$taxa.num
HC=ClusterHC(DF[,impGLMNET_ext.10],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

glmnet.ext.10=c(length(impGLMNET_ext.10),Sep(HC))

With 10 replicates, 67 significant taxa are found, perfectly separating the two groups, with a separation of 1.26.

Significant=cbind(Significant,
glmnet_sign.ext.10.T0=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`>0]),
glmnet_sign.ext.10.T1=Indicador(coda.glmnet.10$taxa.num[coda.glmnet.10$`log-contrast coefficients`<0]))

6.10.4 OTUs significant according to ALDeX

> initial_seed=as.integer(Sys.time())
> print (initial_seed)
# [1] 1717738742
initial_seed %% 10000
# [1] 8742
set.seed(8742)
repl_kw=aldexCesc.clr(t(DF), conds=Type, mc.samples=1000, verbose=FALSE)
aldex_kw=aldex.kw(repl_kw, verbose=FALSE)
#
repl_glm=aldexCesc.clr(t(DF), conds=model.matrix(~Type,data.frame(Type)), mc.samples=1000, verbose=FALSE)
aldex_glm=aldex.glm(repl_glm, model.matrix(~Type,data.frame(Type)))
#'
saveRDS(aldex_kw, file="aldex_kw_CYM_T0T1.RData")
saveRDS(aldex_glm, file="aldex_glm_CYM_T0T1.RData")
p.valores_kw=readRDS("aldex_kw_CYM_T0T1.RData") 
length(p.valores_kw[p.valores_kw[,2]<0.05,2])
## [1] 0
length(p.valores_kw[p.valores_kw[,4]<0.05,2])
## [1] 88
length(p.valores_kw[p.valores_kw[,4]<0.01,4])
## [1] 6

There are no OTUs with an adjusted Kruskal–Wallis p-value < 0.05. There are 66 OTUs with an unadjusted p-value < 0.05, but they do not separate the samples well:

sep.kw.noadjust.0.05=QQ.HC.noadjust(p.valores_kw,DF,Type,q=0.05,1,BP=FALSE)

Significant=cbind(Significant,
Aldex.kw.noadjust.0.05=Indicador(sep.kw.noadjust.0.05))

6.10.5 Simper

The significant taxa identified by simper do not classify the samples well:

DF.prop=DF.0.original/rowSums(DF.0.original)
SMP.prop=simper(DF.prop,group=Type)
indexos_SMP=SMP.prop$CYM_T0_CYM_T1$ord[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)]
OTUs_SMP=sort(attr(SMP.prop$CYM_T0_CYM_T1$cusum[which(SMP.prop$CYM_T0_CYM_T1$cusum<=0.7)],"names"))
OTUs_SMP.good=setdiff(OTUs_SMP,Unics$OTUs)
HC=ClusterHC(DF[,OTUs_SMP.good],dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

Significant=cbind(Significant,
t(DF.prop))

6.10.6 Summary

write.csv2(Significant,"OTU_Significativos_CYM_T0T1.csv",row.names=FALSE)
Resultados=rbind(
LR.max,
glmnet.ext.3,
glmnet.ext.5,
glmnet.ext.10)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10"
)
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",xlim=c(0,650),xaxp=c(0,650,13),ylim=c(0,max(Resultados[,2])+0.1))


lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.01,col="blue",labels=row.names(Resultados),cex=0.75)

We intersect the OTUs identified as significant according to LR and coda_glmnet with 3 replicates.

Significant.capat=Significant[,c(1,2,3,4,5)]
Significant.capat=cbind(Significant.capat,Cuánto=rowSums(Significant.capat[,3:dim(Significant.capat)[2]]))
Significant.capat=Significant.capat[Significant.capat$Cuánto==2,]
DF.Imp=DF[,Significant.capat[,1]]
HC=ClusterHC(DF.Imp,dendrograma=TRUE,barplot=FALSE,Grups=Type,colores=colors)

There are 1 taxa. They are:

Significant.capat[,1:2]%>%
  kbl() %>%
  kable_styling() 
OTUs taxa
OTU235 OTU235 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiomicrospirales;f__Thiomicrospiraceae;g__endosymbionts;s__uncultured_bacterium

The separation between groups is 0.98.

We update the plot with the separations:

intersección=c(dim(Significant.capat)[1],Sep(HC))
Resultados=rbind(Resultados,intersección)
row.names(Resultados)=c(
"LR",
"glmnet.ext.3",
"glmnet.ext.5",
"glmnet.ext.10",
"intersection")
colnames(Resultados)=c("Size","Separation")
par(opar)
fit = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),3],cv=TRUE)
fit.low = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),4],cv=TRUE)
fit.up = smooth.spline(F.SS[!is.na(F.SS[,3]),1],F.SS[!is.na(F.SS[,3]),5],cv=TRUE)

plot(F.SS[,c(1,3)],pch=20,cex=0.25,col="darkgrey",xlab="n",ylab="Separation",main="Average separation for perfect classifiers",ylim=c(0,2.25),xlim=c(0,250),xaxp=c(0,250,10),yaxp=c(0,2.5,10))
lines(fit,col="black",lwd=1)
lines(fit.low,col="red",lwd=1)
lines(fit.up,col="red",lwd=1)
points(Resultados,col="blue",type="h")
points(Resultados,col="blue",pch=20)
text(Resultados[,1],Resultados[,2]+0.02,col="blue",labels=row.names(Resultados),cex=0.75)
abline(h=2.47,col="green")
text(20,2.6,col="green",labels="Global",cex=0.75)
Resultados %>%
  kbl() %>%
  kable_styling()